Preprocessing and Exploratory Data Analysis

a) Missing values

# Load the raw training and test sets from comma-separated text files.
# No header row is read (read.table's default), so the columns are named
# afterwards.
#   na.strings = "?"   : fields equal to "?" are read as NA.
#   strip.white = TRUE : leading/trailing white space around fields is removed.
#                        The NA test runs after stripping, so a field written
#                        as " ?" in the file is also read as NA.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
train <- read.table("../data/rawdata/adult.data.txt", sep = ",",
                    na.strings = "?", strip.white = TRUE)
test <- read.table("../data/rawdata/adult.test.txt", sep = ",",
                   na.strings = "?", strip.white = TRUE)

# Check the dimensions (rows, columns) of both data sets.
dim(train)
## [1] 32561    15
dim(test)
## [1] 16281    15

# Column names shared by the training and test sets. They are defined once so
# the two data frames cannot drift apart. Several names contain hyphens and are
# therefore non-syntactic: access them with backticks or [["..."]].
adult_colnames <- c("age", "workclass", "fnlwgt", "education", "education-num",
                    "marital-status", "occupation", "relationship", "race",
                    "sex", "capital-gain", "capital-loss", "hours-per-week",
                    "native-country", "income")

# Fail early if either file was not parsed into the expected number of columns.
stopifnot(ncol(train) == length(adult_colnames),
          ncol(test) == length(adult_colnames))

colnames(train) <- adult_colnames
colnames(test) <- adult_colnames



# Find missing values and NAs for the training set.
# For every column, print the row names and the count of entries that are
#   (1) NA,
#   (2) empty strings "",
#   (3) the literal string " ?" (a question mark with a leading space).
# The data were read with na.strings = "?" and strip.white = TRUE, so checks
# (2) and (3) are expected to report 0; they act as a sanity check on parsing.
# seq_len() keeps the loop from running when there are no columns, which
# 1:ncol(train) would not.
for (i in seq_len(ncol(train))) {
  col_name <- colnames(train)[i]
  col_values <- train[[i]]

  # Compute each selection once and reuse it for both the listing and the count.
  na_rows <- rownames(train)[is.na(col_values)]
  # which() drops the NA results produced by comparing NA entries.
  blank_rows <- rownames(train)[which(col_values == "")]
  qmark_rows <- rownames(train)[which(col_values == " ?")]

  cat("<names of NA rows in", col_name, "variable>", "\n")
  cat(na_rows, "\n")
  cat("Number of NA values:  ", length(na_rows), "\n")
  print("======================================")
  print("======================================")

  cat("<names of rows contain missing values in", col_name, "variable>", "\n")
  cat(blank_rows, "\n")
  cat("Number of Missing values :  ", length(blank_rows), "\n")
  print("======================================")
  print("======================================")

  cat("<names of rows contain ? values in", col_name, "variable>", "\n")
  cat(qmark_rows, "\n")
  cat("Number of ? values :  ", length(qmark_rows), "\n")
  print("======================================")
  print("======================================")
}
## <names of NA rows in age variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in age variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in age variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in workclass variable> 
## 28 62 70 78 107 129 150 155 161 188 202 222 227 244 267 298 313 327 347 348 355 398 409 431 432 450 460 472 485 487 500 512 516 518 519 540 577 581 591 592 597 649 657 669 672 687 696 735 789 790 793 807 812 831 835 886 891 904 925 932 970 983 1020 1035 1036 1040 1047 1098 1101 1132 1135 1153 1168 1176 1181 1186 1209 1216 1218 1263 1283 1291 1313 1318 1326 1348 1350 1372 1389 1405 1421 1434 1442 1453 1459 1505 1545 1563 1570 1571 1575 1580 1593 1607 1630 1657 1666 1677 1705 1708 1759 1762 1774 1779 1824 1847 1852 1866 1879 1924 1932 1972 1988 2026 2037 2047 2062 2073 2085 2092 2095 2106 2119 2127 2153 2156 2164 2165 2211 2214 2223 2282 2293 2324 2328 2341 2355 2357 2359 2360 2373 2381 2383 2398 2421 2428 2465 2477 2487 2492 2497 2507 2514 2522 2545 2567 2571 2572 2579 2587 2595 2607 2633 2635 2639 2674 2677 2690 2752 2761 2762 2848 2857 2858 2859 2886 2931 2933 2948 2953 2962 3000 3006 3034 3043 3066 3073 3089 3097 3120 3132 3147 3208 3212 3229 3232 3240 3256 3270 3292 3298 3331 3339 3352 3372 3388 3403 3440 3454 3457 3460 3487 3517 3532 3556 3573 3574 3580 3589 3593 3595 3598 3632 3671 3704 3726 3737 3745 3748 3760 3774 3776 3806 3823 3835 3844 3852 3864 3888 3896 3898 3902 3903 3917 3943 3948 3950 3951 3964 3970 3981 3991 4004 4018 4019 4022 4073 4081 4087 4091 4110 4148 4153 4156 4170 4175 4201 4213 4216 4218 4241 4273 4289 4300 4310 4316 4338 4370 4394 4410 4414 4423 4436 4438 4461 4465 4500 4501 4520 4532 4549 4554 4593 4607 4608 4614 4622 4628 4656 4686 4689 4722 4730 4746 4753 4756 4767 4779 4783 4802 4813 4819 4836 4839 4885 4926 4943 4959 4971 4981 4983 5021 5065 5148 5157 5173 5194 5199 5208 5210 5215 5229 5256 5295 5297 5303 5308 5321 5322 5341 5345 5347 5384 5385 5441 5447 5472 5493 5527 5530 5549 5562 5566 5591 5624 5633 5653 5680 5688 5721 5754 5767 5789 5804 5809 5833 5854 5917 5922 5929 5978 5984 6016 6040 6052 6060 6132 6179 6232 6285 6286 6315 6343 6352 6408 6433 6449 6511 6522 6537 6543 6550 6559 6565 6592 6641 6647 6664 6680 6734 6735 6754 
6767 6799 6835 6861 6863 6878 6897 6915 6936 6949 6994 6996 7012 7028 7050 7076 7101 7103 7107 7137 7150 7165 7168 7174 7194 7292 7302 7323 7341 7353 7438 7458 7464 7473 7511 7555 7560 7561 7577 7580 7585 7612 7664 7684 7725 7741 7747 7751 7764 7774 7785 7788 7816 7827 7840 7863 7873 7877 7901 7906 7943 7964 7972 7978 8000 8003 8008 8023 8043 8054 8058 8070 8086 8089 8097 8099 8101 8135 8148 8169 8190 8194 8223 8242 8298 8323 8365 8388 8430 8447 8448 8473 8500 8533 8544 8566 8608 8637 8644 8674 8693 8695 8750 8758 8765 8770 8783 8789 8796 8806 8823 8848 8854 8909 8921 8941 8950 8955 8964 8986 8992 8997 9029 9031 9107 9139 9141 9142 9148 9149 9156 9171 9179 9198 9212 9215 9246 9294 9325 9341 9343 9352 9354 9359 9368 9410 9453 9478 9485 9491 9501 9532 9537 9550 9558 9578 9583 9617 9627 9651 9704 9709 9713 9779 9788 9861 9873 9886 9908 9927 9928 9939 9988 10014 10016 10017 10036 10057 10065 10095 10099 10103 10111 10118 10127 10140 10144 10162 10223 10233 10254 10329 10343 10362 10412 10426 10438 10441 10461 10476 10486 10487 10540 10547 10571 10582 10674 10680 10682 10684 10685 10701 10705 10710 10716 10719 10746 10747 10785 10806 10821 10829 10838 10857 10882 10932 10933 10956 10960 10996 11002 11028 11040 11047 11057 11060 11086 11088 11100 11159 11160 11165 11193 11199 11218 11229 11235 11262 11287 11295 11317 11329 11335 11341 11347 11356 11392 11414 11422 11431 11461 11475 11485 11501 11517 11527 11533 11545 11551 11574 11579 11581 11592 11615 11622 11659 11689 11692 11714 11732 11733 11735 11769 11771 11774 11794 11852 11865 11939 12008 12009 12021 12030 12069 12094 12099 12131 12154 12162 12176 12199 12214 12215 12219 12254 12300 12327 12335 12352 12374 12378 12406 12412 12429 12439 12452 12492 12493 12544 12554 12590 12610 12628 12652 12668 12785 12795 12850 12854 12859 12909 12919 12920 12937 12982 12992 12997 13026 13027 13043 13044 13066 13070 13075 13096 13111 13154 13162 13178 13182 13183 13204 13235 13291 13303 13322 13348 13366 13371 13385 13448 13494 
13499 13505 13516 13526 13535 13553 13557 13570 13587 13609 13627 13647 13712 13745 13785 13816 13844 13846 13885 13934 13950 14006 14054 14068 14109 14119 14124 14152 14179 14204 14214 14218 14255 14256 14281 14331 14345 14349 14361 14364 14371 14399 14419 14431 14442 14500 14535 14536 14537 14542 14549 14572 14575 14579 14603 14619 14647 14673 14689 14693 14718 14719 14726 14743 14747 14860 14861 14871 14888 14912 14940 14946 14981 14983 15013 15023 15034 15065 15070 15131 15177 15193 15221 15239 15257 15267 15287 15293 15310 15311 15351 15415 15425 15427 15465 15472 15477 15485 15500 15524 15533 15543 15548 15580 15581 15585 15597 15599 15617 15644 15675 15686 15697 15744 15774 15779 15783 15847 15861 15872 15912 15961 16000 16005 16020 16064 16066 16083 16104 16118 16124 16131 16137 16147 16152 16156 16174 16180 16186 16197 16213 16222 16293 16295 16347 16380 16383 16400 16405 16411 16455 16457 16489 16491 16516 16524 16536 16567 16584 16596 16603 16643 16660 16680 16726 16732 16744 16749 16751 16756 16761 16763 16795 16799 16803 16811 16818 16828 16836 16839 16879 16908 16967 16979 16985 17016 17031 17040 17097 17099 17133 17169 17210 17248 17280 17300 17315 17322 17327 17348 17392 17413 17415 17463 17471 17506 17532 17538 17588 17595 17636 17644 17645 17649 17709 17711 17718 17724 17726 17751 17753 17758 17763 17774 17791 17812 17838 17877 17883 17903 17906 18009 18011 18020 18037 18057 18095 18162 18165 18182 18195 18202 18218 18219 18233 18237 18245 18258 18260 18295 18323 18332 18338 18343 18357 18359 18363 18385 18387 18410 18467 18471 18497 18535 18542 18561 18565 18578 18600 18601 18605 18616 18623 18656 18721 18731 18751 18754 18795 18806 18847 18913 18924 18925 18932 18935 18943 18953 18965 18990 18993 19042 19059 19074 19091 19134 19135 19154 19169 19181 19231 19234 19241 19254 19256 19285 19312 19319 19321 19338 19346 19434 19439 19456 19462 19463 19493 19510 19545 19547 19549 19562 19617 19621 19658 19707 19709 19765 19776 19787 19789 19813 19815 
19820 19821 19831 19843 19858 19890 19897 19987 20004 20008 20010 20018 20024 20030 20032 20039 20065 20069 20073 20095 20100 20106 20160 20162 20188 20192 20206 20228 20267 20270 20272 20292 20303 20315 20322 20334 20397 20400 20435 20437 20475 20481 20528 20531 20545 20552 20564 20577 20596 20609 20613 20647 20657 20665 20687 20725 20758 20774 20776 20777 20783 20798 20804 20826 20827 20830 20869 20878 20881 20916 20931 20939 20942 20954 20964 21040 21097 21115 21126 21143 21147 21153 21159 21177 21180 21243 21244 21273 21275 21290 21349 21357 21395 21411 21414 21423 21429 21438 21454 21456 21466 21484 21488 21492 21517 21525 21529 21538 21546 21550 21587 21627 21632 21649 21667 21687 21699 21726 21747 21763 21800 21804 21806 21813 21830 21848 21852 21858 21862 21893 21915 21920 21947 22007 22023 22034 22043 22061 22076 22095 22102 22110 22131 22166 22174 22226 22255 22282 22349 22351 22374 22379 22380 22388 22397 22406 22423 22496 22503 22511 22546 22557 22634 22645 22732 22752 22758 22787 22796 22799 22821 22834 22842 22849 22886 22899 22911 22929 22952 22977 22984 22998 23010 23020 23063 23100 23110 23124 23137 23177 23193 23210 23229 23238 23268 23282 23316 23331 23334 23337 23343 23352 23354 23374 23376 23389 23396 23416 23422 23503 23536 23537 23539 23545 23552 23593 23637 23671 23712 23730 23745 23756 23760 23794 23819 23823 23826 23854 23866 23881 23916 23919 23921 23937 23955 23981 24004 24016 24048 24054 24055 24074 24085 24110 24134 24150 24156 24184 24196 24233 24239 24242 24248 24300 24309 24319 24380 24387 24406 24430 24462 24476 24486 24528 24533 24580 24581 24637 24645 24687 24695 24705 24715 24736 24753 24761 24774 24780 24782 24788 24797 24810 24814 24822 24829 24876 24877 24895 24903 24914 24950 24999 25010 25035 25045 25054 25057 25061 25068 25077 25100 25123 25135 25164 25187 25209 25217 25226 25239 25267 25271 25296 25304 25322 25360 25398 25418 25425 25431 25442 25458 25520 25523 25525 25538 25569 25579 25589 25637 25670 25686 25704 25727 
25749 25756 25773 25803 25807 25814 25820 25827 25836 25847 25854 25878 25903 25929 25930 25931 25977 25981 26013 26041 26052 26082 26094 26120 26128 26138 26144 26146 26164 26226 26245 26260 26288 26296 26297 26313 26332 26349 26364 26374 26376 26411 26417 26422 26451 26490 26504 26513 26514 26523 26550 26567 26587 26599 26600 26682 26688 26689 26777 26792 26839 26867 26925 26929 26959 26983 26987 26993 26999 27010 27019 27023 27039 27052 27086 27087 27101 27108 27141 27145 27180 27202 27226 27266 27268 27318 27322 27332 27351 27383 27396 27427 27453 27467 27476 27499 27508 27514 27519 27523 27550 27564 27570 27579 27580 27591 27596 27614 27643 27658 27666 27677 27699 27703 27708 27724 27731 27771 27775 27809 27845 27854 27899 27916 27936 27945 27972 27994 28010 28021 28038 28044 28113 28139 28144 28145 28146 28149 28165 28176 28195 28207 28211 28242 28272 28277 28294 28304 28321 28345 28366 28383 28385 28395 28426 28477 28479 28504 28544 28549 28587 28604 28630 28687 28689 28716 28774 28784 28803 28812 28850 28856 28860 28886 28892 28905 28919 28923 28934 28944 28954 28958 28962 29027 29041 29062 29063 29073 29095 29098 29102 29120 29129 29178 29207 29223 29260 29298 29311 29319 29325 29326 29341 29360 29361 29376 29392 29398 29422 29437 29448 29456 29466 29481 29528 29583 29587 29602 29615 29663 29694 29699 29711 29725 29738 29753 29792 29799 29808 29815 29820 29836 29843 29882 29903 29957 29968 30043 30060 30062 30069 30103 30157 30159 30164 30207 30209 30210 30219 30269 30278 30296 30314 30335 30370 30385 30398 30403 30413 30445 30451 30457 30469 30514 30559 30570 30585 30616 30624 30625 30629 30642 30661 30674 30678 30688 30694 30699 30708 30712 30714 30724 30728 30745 30758 30777 30782 30785 30823 30828 30831 30898 30931 30948 30978 31019 31033 31037 31044 31062 31063 31074 31095 31102 31109 31119 31124 31179 31194 31202 31221 31236 31248 31254 31274 31277 31280 31308 31314 31361 31373 31391 31422 31433 31528 31537 31541 31569 31578 31592 31595 31606 31622 
31634 31636 31662 31665 31669 31697 31699 31711 31712 31724 31732 31740 31752 31754 31759 31766 31773 31776 31791 31793 31794 31811 31822 31837 31863 31872 31886 31909 31913 31914 31998 31999 32009 32017 32040 32063 32071 32074 32076 32081 32084 32089 32094 32104 32190 32202 32207 32276 32292 32311 32318 32336 32344 32427 32478 32491 32495 32526 32531 32532 32540 32542 32543 
## Number of NA values:   1836 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in workclass variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in workclass variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in fnlwgt variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in fnlwgt variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in fnlwgt variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education-num variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education-num variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education-num variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in marital-status variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in marital-status variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in marital-status variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in occupation variable> 
## 28 62 70 78 107 129 150 155 161 188 202 222 227 244 267 298 313 327 347 348 355 398 409 431 432 450 460 472 485 487 500 512 516 518 519 540 577 581 591 592 597 649 657 669 672 687 696 735 789 790 793 807 812 831 835 886 891 904 925 932 970 983 1020 1035 1036 1040 1047 1098 1101 1132 1135 1153 1168 1176 1181 1186 1209 1216 1218 1263 1283 1291 1313 1318 1326 1348 1350 1372 1389 1405 1421 1434 1442 1453 1459 1505 1545 1563 1570 1571 1575 1580 1593 1607 1630 1657 1666 1677 1705 1708 1759 1762 1774 1779 1824 1847 1852 1866 1879 1924 1932 1972 1988 2026 2037 2047 2062 2073 2085 2092 2095 2106 2119 2127 2153 2156 2164 2165 2211 2214 2223 2282 2293 2324 2328 2341 2355 2357 2359 2360 2373 2381 2383 2398 2421 2428 2465 2477 2487 2492 2497 2507 2514 2522 2545 2567 2571 2572 2579 2587 2595 2607 2633 2635 2639 2674 2677 2690 2752 2761 2762 2848 2857 2858 2859 2886 2931 2933 2948 2953 2962 3000 3006 3034 3043 3066 3073 3089 3097 3120 3132 3147 3208 3212 3229 3232 3240 3256 3270 3292 3298 3331 3339 3352 3372 3388 3403 3440 3454 3457 3460 3487 3517 3532 3556 3573 3574 3580 3589 3593 3595 3598 3632 3671 3704 3726 3737 3745 3748 3760 3774 3776 3806 3823 3835 3844 3852 3864 3888 3896 3898 3902 3903 3917 3943 3948 3950 3951 3964 3970 3981 3991 4004 4018 4019 4022 4073 4081 4087 4091 4110 4148 4153 4156 4170 4175 4201 4213 4216 4218 4241 4273 4289 4300 4310 4316 4338 4370 4394 4410 4414 4423 4436 4438 4461 4465 4500 4501 4520 4532 4549 4554 4593 4607 4608 4614 4622 4628 4656 4686 4689 4722 4730 4746 4753 4756 4767 4779 4783 4802 4813 4819 4836 4839 4885 4926 4943 4959 4971 4981 4983 5021 5065 5148 5157 5173 5194 5199 5208 5210 5215 5229 5256 5295 5297 5303 5308 5321 5322 5341 5345 5347 5362 5384 5385 5441 5447 5472 5493 5527 5530 5549 5562 5566 5591 5624 5633 5653 5680 5688 5721 5754 5767 5789 5804 5809 5833 5854 5917 5922 5929 5978 5984 6016 6040 6052 6060 6132 6179 6232 6285 6286 6315 6343 6352 6408 6433 6449 6511 6522 6537 6543 6550 6559 6565 6592 6641 6647 6664 6680 6734 6735 
6754 6767 6799 6835 6861 6863 6878 6897 6915 6936 6949 6994 6996 7012 7028 7050 7076 7101 7103 7107 7137 7150 7165 7168 7174 7194 7292 7302 7323 7341 7353 7438 7458 7464 7473 7511 7555 7560 7561 7577 7580 7585 7612 7664 7684 7725 7741 7747 7751 7764 7774 7785 7788 7816 7827 7840 7863 7873 7877 7901 7906 7943 7964 7972 7978 8000 8003 8008 8023 8043 8054 8058 8070 8086 8089 8097 8099 8101 8135 8148 8169 8190 8194 8223 8242 8298 8323 8365 8388 8430 8447 8448 8473 8500 8533 8544 8566 8608 8637 8644 8674 8693 8695 8750 8758 8765 8770 8783 8789 8796 8806 8823 8848 8854 8909 8921 8941 8950 8955 8964 8986 8992 8997 9029 9031 9107 9139 9141 9142 9148 9149 9156 9171 9179 9198 9212 9215 9246 9294 9325 9341 9343 9352 9354 9359 9368 9410 9453 9478 9485 9491 9501 9532 9537 9550 9558 9578 9583 9617 9627 9651 9704 9709 9713 9779 9788 9861 9873 9886 9908 9927 9928 9939 9988 10014 10016 10017 10036 10057 10065 10095 10099 10103 10111 10118 10127 10140 10144 10162 10223 10233 10254 10329 10343 10362 10412 10426 10438 10441 10461 10476 10486 10487 10540 10547 10571 10582 10674 10680 10682 10684 10685 10701 10705 10710 10716 10719 10746 10747 10785 10806 10821 10829 10838 10846 10857 10882 10932 10933 10956 10960 10996 11002 11028 11040 11047 11057 11060 11086 11088 11100 11159 11160 11165 11193 11199 11218 11229 11235 11262 11287 11295 11317 11329 11335 11341 11347 11356 11392 11414 11422 11431 11461 11475 11485 11501 11517 11527 11533 11545 11551 11574 11579 11581 11592 11615 11622 11659 11689 11692 11714 11732 11733 11735 11769 11771 11774 11794 11852 11865 11939 12008 12009 12021 12030 12069 12094 12099 12131 12154 12162 12176 12199 12214 12215 12219 12254 12300 12327 12335 12352 12374 12378 12406 12412 12429 12439 12452 12492 12493 12544 12554 12590 12610 12628 12652 12668 12785 12795 12850 12854 12859 12909 12919 12920 12937 12982 12992 12997 13026 13027 13043 13044 13066 13070 13075 13096 13111 13154 13162 13178 13182 13183 13204 13235 13291 13303 13322 13348 13366 13371 13385 
13448 13494 13499 13505 13516 13526 13535 13553 13557 13570 13587 13609 13627 13647 13712 13745 13785 13816 13844 13846 13885 13934 13950 14006 14054 14068 14109 14119 14124 14152 14179 14204 14214 14218 14255 14256 14281 14331 14345 14349 14361 14364 14371 14399 14419 14431 14442 14500 14535 14536 14537 14542 14549 14572 14575 14579 14603 14619 14647 14673 14689 14693 14718 14719 14726 14743 14747 14773 14860 14861 14871 14888 14912 14940 14946 14981 14983 15013 15023 15034 15065 15070 15131 15177 15193 15221 15239 15257 15267 15287 15293 15310 15311 15351 15415 15425 15427 15465 15472 15477 15485 15500 15524 15533 15543 15548 15580 15581 15585 15597 15599 15617 15644 15675 15686 15697 15744 15774 15779 15783 15847 15861 15872 15912 15961 16000 16005 16020 16064 16066 16083 16104 16118 16124 16131 16137 16147 16152 16156 16174 16180 16186 16197 16213 16222 16293 16295 16347 16380 16383 16400 16405 16411 16455 16457 16489 16491 16516 16524 16536 16567 16584 16596 16603 16643 16660 16680 16726 16732 16744 16749 16751 16756 16761 16763 16795 16799 16803 16811 16818 16828 16836 16839 16879 16908 16967 16979 16985 17016 17031 17040 17097 17099 17133 17169 17210 17248 17280 17300 17315 17322 17327 17348 17392 17413 17415 17463 17471 17506 17532 17538 17588 17595 17636 17644 17645 17649 17709 17711 17718 17724 17726 17751 17753 17758 17763 17774 17791 17812 17838 17877 17883 17903 17906 18009 18011 18020 18037 18057 18095 18162 18165 18182 18195 18202 18218 18219 18233 18237 18245 18258 18260 18295 18323 18332 18338 18343 18357 18359 18363 18385 18387 18410 18467 18471 18497 18535 18542 18561 18565 18578 18600 18601 18605 18616 18623 18656 18721 18731 18751 18754 18795 18806 18847 18913 18924 18925 18932 18935 18943 18953 18965 18990 18993 19042 19059 19074 19091 19134 19135 19154 19169 19181 19231 19234 19241 19254 19256 19285 19312 19319 19321 19338 19346 19434 19439 19456 19462 19463 19493 19510 19545 19547 19549 19562 19617 19621 19658 19707 19709 19765 19776 19787 
19789 19813 19815 19820 19821 19831 19843 19858 19890 19897 19987 20004 20008 20010 20018 20024 20030 20032 20039 20065 20069 20073 20095 20100 20106 20160 20162 20188 20192 20206 20228 20267 20270 20272 20292 20303 20315 20322 20334 20338 20397 20400 20435 20437 20475 20481 20528 20531 20545 20552 20564 20577 20596 20609 20613 20647 20657 20665 20687 20725 20758 20774 20776 20777 20783 20798 20804 20826 20827 20830 20869 20878 20881 20916 20931 20939 20942 20954 20964 21040 21097 21115 21126 21143 21147 21153 21159 21177 21180 21243 21244 21273 21275 21290 21349 21357 21395 21411 21414 21423 21429 21438 21454 21456 21466 21484 21488 21492 21517 21525 21529 21538 21546 21550 21587 21627 21632 21649 21667 21687 21699 21726 21747 21763 21800 21804 21806 21813 21830 21848 21852 21858 21862 21893 21915 21920 21947 22007 22023 22034 22043 22061 22076 22095 22102 22110 22131 22166 22174 22226 22255 22282 22349 22351 22374 22379 22380 22388 22397 22406 22423 22496 22503 22511 22546 22557 22634 22645 22732 22752 22758 22787 22796 22799 22821 22834 22842 22849 22886 22899 22911 22929 22952 22977 22984 22998 23010 23020 23063 23100 23110 23124 23137 23177 23193 23210 23229 23233 23238 23268 23282 23316 23331 23334 23337 23343 23352 23354 23374 23376 23389 23396 23416 23422 23503 23536 23537 23539 23545 23552 23593 23637 23671 23712 23730 23745 23756 23760 23794 23819 23823 23826 23854 23866 23881 23916 23919 23921 23937 23955 23981 24004 24016 24048 24054 24055 24074 24085 24110 24134 24150 24156 24184 24196 24233 24239 24242 24248 24300 24309 24319 24380 24387 24406 24430 24462 24476 24486 24528 24533 24580 24581 24637 24645 24687 24695 24705 24715 24736 24753 24761 24774 24780 24782 24788 24797 24810 24814 24822 24829 24876 24877 24895 24903 24914 24950 24999 25010 25035 25045 25054 25057 25061 25068 25077 25100 25123 25135 25164 25187 25209 25217 25226 25239 25267 25271 25296 25304 25322 25360 25398 25418 25425 25431 25442 25458 25520 25523 25525 25538 25569 25579 25589 
25637 25670 25686 25704 25727 25749 25756 25773 25803 25807 25814 25820 25827 25836 25847 25854 25878 25903 25929 25930 25931 25977 25981 26013 26041 26052 26082 26094 26120 26128 26138 26144 26146 26164 26226 26245 26260 26288 26296 26297 26313 26332 26349 26364 26374 26376 26411 26417 26422 26451 26490 26504 26513 26514 26523 26550 26567 26587 26599 26600 26682 26688 26689 26777 26792 26839 26867 26925 26929 26959 26983 26987 26993 26999 27010 27019 27023 27039 27052 27086 27087 27101 27108 27141 27145 27180 27202 27226 27266 27268 27318 27322 27332 27351 27383 27396 27427 27453 27467 27476 27499 27508 27514 27519 27523 27550 27564 27570 27579 27580 27591 27596 27614 27643 27658 27666 27677 27699 27703 27708 27724 27731 27771 27775 27809 27845 27854 27899 27916 27936 27945 27972 27994 28010 28021 28038 28044 28113 28139 28144 28145 28146 28149 28165 28176 28195 28207 28211 28242 28272 28277 28294 28304 28321 28345 28366 28383 28385 28395 28426 28477 28479 28504 28544 28549 28587 28604 28630 28687 28689 28716 28774 28784 28803 28812 28850 28856 28860 28886 28892 28905 28919 28923 28934 28944 28954 28958 28962 29027 29041 29062 29063 29073 29095 29098 29102 29120 29129 29178 29207 29223 29260 29298 29311 29319 29325 29326 29341 29360 29361 29376 29392 29398 29422 29437 29448 29456 29466 29481 29528 29583 29587 29602 29615 29663 29694 29699 29711 29725 29738 29753 29792 29799 29808 29815 29820 29836 29843 29882 29903 29957 29968 30043 30060 30062 30069 30103 30157 30159 30164 30207 30209 30210 30219 30269 30278 30296 30314 30335 30370 30385 30398 30403 30413 30445 30451 30457 30469 30514 30559 30570 30585 30616 30624 30625 30629 30642 30661 30674 30678 30688 30694 30699 30708 30712 30714 30724 30728 30745 30758 30777 30782 30785 30823 30828 30831 30898 30931 30948 30978 31019 31033 31037 31044 31062 31063 31074 31095 31102 31109 31119 31124 31179 31194 31202 31221 31236 31248 31254 31274 31277 31280 31308 31314 31361 31373 31391 31422 31433 31528 31537 31541 31569 
31578 31592 31595 31606 31622 31634 31636 31662 31665 31669 31697 31699 31711 31712 31724 31732 31740 31752 31754 31759 31766 31773 31776 31791 31793 31794 31811 31822 31837 31863 31872 31886 31909 31913 31914 31998 31999 32009 32017 32040 32063 32071 32074 32076 32081 32084 32089 32094 32104 32190 32202 32207 32276 32292 32305 32311 32315 32318 32336 32344 32427 32478 32491 32495 32526 32531 32532 32540 32542 32543 
## Number of NA values:   1843 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in occupation variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in occupation variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in relationship variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in relationship variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in relationship variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in race variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in race variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in race variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in sex variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in sex variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in sex variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-gain variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-gain variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-gain variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-loss variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-loss variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-loss variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in hours-per-week variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in hours-per-week variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in hours-per-week variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in native-country variable> 
## 15 39 52 62 94 246 250 298 394 454 558 713 726 730 778 781 888 956 1027 1037 1116 1153 1159 1200 1225 1253 1327 1349 1392 1555 1558 1582 1594 1677 1712 1739 1819 1901 1991 2016 2100 2105 2182 2372 2513 2514 2519 2550 2573 2588 2592 2640 2718 2736 2776 2795 2910 2927 3024 3108 3132 3165 3167 3188 3201 3233 3248 3257 3462 3485 3496 3533 3580 3637 3835 3857 3859 4007 4157 4173 4198 4245 4302 4327 4397 4406 4463 4511 4579 4600 4640 4657 4659 4672 4773 4787 4828 5082 5181 5186 5202 5235 5310 5348 5375 5402 5451 5541 5648 5664 5684 5710 5824 5842 5855 5964 6006 6060 6130 6177 6187 6243 6320 6361 6365 6377 6396 6534 6677 6738 6845 7046 7073 7081 7097 7154 7167 7177 7254 7285 7328 7346 7399 7476 7616 7635 7689 7851 7862 7863 7903 7965 7991 8146 8161 8208 8226 8283 8357 8366 8478 8872 8904 8916 9016 9041 9238 9367 9419 9504 9538 9560 9581 9617 9625 9740 9786 9800 9850 9867 9986 10012 10063 10183 10185 10219 10289 10344 10354 10404 10409 10575 10635 10648 10675 10763 10778 10783 11148 11188 11222 11285 11301 11424 11447 11478 11596 11615 11653 11660 11984 11989 12005 12083 12115 12173 12261 12281 12316 12330 12363 12471 12561 12644 12656 12691 12696 12717 12749 12831 12900 12960 12974 12997 13089 13199 13202 13282 13306 13500 13604 13692 13748 13769 13818 13821 13827 13828 13898 13914 13919 13972 14044 14086 14103 14196 14235 14247 14341 14369 14411 14460 14563 14578 14583 14585 14593 14858 15024 15037 15137 15153 15162 15198 15220 15445 15476 15529 15595 15610 15614 15673 15679 15693 15735 15793 15864 15932 15933 15954 15989 16037 16080 16109 16142 16143 16232 16261 16267 16329 16382 16418 16440 16489 16501 16636 16648 16839 16863 16976 17022 17108 17194 17202 17275 17379 17453 17482 17483 17624 17648 17895 18066 18234 18278 18366 18413 18439 18460 18556 18586 18616 18673 18678 18907 18910 18983 19038 19047 19056 19170 19246 19257 19300 19317 19327 19347 19352 19415 19491 19533 19627 19677 19710 19728 19769 19785 19788 19947 19998 20204 20285 20334 20359 20465 20481 
20500 20532 20633 20639 20658 20659 20717 20748 20848 21063 21109 21127 21135 21196 21227 21265 21383 21394 21532 21542 21557 21669 21723 21819 22003 22069 22107 22231 22242 22265 22318 22352 22430 22475 22541 22562 22615 22640 22678 22743 22772 22789 22791 22862 22908 22982 23033 23116 23174 23237 23285 23435 23441 23467 23471 23566 23638 23688 23705 23730 23785 23798 23893 23916 24214 24458 24466 24573 24593 24607 24663 24696 24751 24833 24891 24892 24924 24961 24981 25047 25106 25113 25236 25242 25276 25297 25314 25343 25360 25459 25479 25492 25505 25550 25575 25620 25630 25842 25871 26008 26198 26222 26235 26272 26297 26333 26364 26378 26406 26447 26461 26570 26617 26662 26763 26801 26901 26923 26941 26980 27020 27069 27134 27142 27300 27306 27377 27384 27670 28019 28045 28108 28125 28195 28196 28197 28221 28336 28344 28432 28483 28501 28506 28590 28619 28629 28689 28706 28836 28842 28913 28933 28938 29030 29034 29099 29105 29213 29256 29324 29358 29378 29402 29441 29524 29593 29681 29683 29739 29778 29787 29889 29982 30011 30106 30111 30171 30231 30275 30277 30303 30330 30370 30583 30639 30657 30671 30701 30774 30822 30903 30923 31090 31129 31337 31360 31388 31397 31469 31556 31638 31642 31702 31797 31945 32091 32170 32214 32233 32255 32308 32414 32450 32470 32493 32511 32526 
## Number of NA values:   583 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in native-country variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in native-country variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in income variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in income variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in income variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
# Find missing values and NAs for the testing set.
# For every column, print the row names and the count of entries that are
# (1) NA, (2) empty strings, (3) the literal " ?" marker.
# The data were read with na.strings = "?" and strip.white = T, so checks (2)
# and (3) are expected to report 0 and serve only as a sanity check.
rule <- "======================================"
for (i in seq_len(ncol(test))) {
  var_name <- colnames(test)[i]
  column <- test[, i]

  # Row names are taken by indexing rownames(test) directly instead of
  # subsetting the whole data frame once per message.
  na_rows <- rownames(test)[is.na(column)]
  cat("<names of NA rows in", var_name, "variable>", "\n")
  cat(na_rows, "\n")
  cat("Number of NA values:  ", length(na_rows), "\n")
  print(rule)
  print(rule)

  # which() discards NA comparison results, so NA cells are not counted here.
  blank_rows <- rownames(test)[which(column == "")]
  cat("<names of rows contain missing values in", var_name, "variable>", "\n")
  cat(blank_rows, "\n")
  cat("Number of Missing values :  ", length(blank_rows), "\n")
  print(rule)
  print(rule)

  qmark_rows <- rownames(test)[which(column == " ?")]
  cat("<names of rows contain ? values in", var_name, "variable>", "\n")
  cat(qmark_rows, "\n")
  cat("Number of ? values :  ", length(qmark_rows), "\n")
  print(rule)
  print(rule)
}
## <names of NA rows in age variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in age variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in age variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in workclass variable> 
## 5 7 14 23 36 76 90 101 114 133 183 186 194 229 230 246 267 269 275 317 332 351 379 395 398 414 430 435 438 471 506 517 564 605 613 627 638 641 642 648 658 665 694 704 718 729 766 769 782 817 874 881 914 916 927 934 961 982 1001 1003 1006 1010 1019 1030 1044 1049 1064 1121 1128 1131 1143 1157 1168 1170 1178 1198 1206 1242 1252 1259 1260 1286 1307 1339 1363 1368 1378 1396 1418 1428 1439 1466 1481 1523 1525 1536 1561 1594 1596 1607 1608 1613 1626 1627 1642 1666 1682 1701 1734 1747 1751 1775 1779 1781 1788 1792 1802 1814 1829 1833 1835 1838 1864 1867 1894 1940 1945 1956 1983 2024 2043 2056 2089 2093 2118 2123 2161 2164 2202 2229 2256 2282 2324 2334 2365 2411 2416 2417 2439 2448 2493 2495 2499 2508 2511 2532 2537 2540 2548 2557 2580 2585 2594 2613 2635 2643 2651 2652 2656 2667 2722 2761 2775 2776 2779 2798 2805 2809 2849 2903 2921 2956 2966 2996 3005 3019 3025 3038 3068 3075 3084 3091 3103 3131 3144 3201 3207 3217 3220 3229 3233 3238 3260 3289 3298 3314 3408 3414 3422 3432 3480 3493 3541 3570 3639 3670 3672 3691 3699 3726 3745 3747 3758 3817 3854 3860 3868 3871 3883 3921 3964 3978 3983 3995 3998 4004 4006 4018 4044 4056 4059 4068 4082 4109 4118 4137 4149 4156 4177 4217 4222 4228 4231 4234 4239 4242 4260 4266 4271 4275 4282 4298 4305 4322 4331 4351 4373 4441 4456 4468 4484 4487 4520 4576 4586 4598 4611 4625 4641 4642 4647 4662 4663 4669 4709 4729 4745 4746 4749 4754 4760 4777 4778 4784 4789 4803 4822 4824 4841 4844 4847 4859 4862 4871 4886 4899 4928 4935 4936 4947 4960 4985 4990 4996 4999 5024 5047 5053 5067 5079 5088 5109 5110 5135 5146 5147 5165 5175 5182 5228 5246 5257 5290 5330 5349 5361 5373 5374 5380 5392 5401 5426 5435 5471 5474 5477 5487 5516 5519 5531 5533 5537 5574 5584 5588 5624 5636 5647 5650 5682 5683 5702 5715 5727 5733 5736 5747 5784 5817 5825 5839 5886 5889 5897 5901 5937 5953 5958 5973 5991 5992 6051 6065 6083 6096 6100 6131 6153 6187 6220 6222 6232 6233 6251 6266 6321 6326 6335 6366 6385 6386 6399 6433 6434 6492 6500 6509 6526 6612 6614 6615 6624 
6643 6649 6661 6674 6685 6752 6767 6784 6791 6801 6806 6840 6842 6870 6875 6885 6918 6929 6942 6993 7067 7072 7139 7142 7153 7170 7171 7224 7232 7252 7264 7267 7271 7304 7375 7426 7436 7445 7457 7481 7512 7530 7535 7540 7559 7562 7567 7611 7621 7624 7634 7641 7648 7724 7725 7735 7772 7776 7786 7794 7800 7812 7829 7872 7883 7893 7896 7901 7924 7925 7927 7948 7971 8019 8024 8028 8042 8080 8104 8105 8112 8116 8117 8123 8162 8184 8190 8204 8207 8224 8232 8235 8247 8249 8251 8259 8285 8290 8291 8338 8342 8347 8384 8585 8612 8627 8636 8649 8661 8668 8682 8695 8703 8711 8714 8722 8754 8761 8764 8795 8804 8837 8838 8840 8852 8870 8876 8901 8917 8922 8954 8989 9031 9033 9055 9082 9084 9090 9092 9103 9129 9136 9138 9146 9176 9212 9252 9258 9275 9290 9313 9320 9355 9362 9368 9381 9382 9383 9418 9438 9494 9495 9502 9517 9525 9564 9568 9585 9586 9603 9608 9609 9624 9637 9649 9668 9709 9716 9724 9736 9751 9753 9768 9803 9808 9832 9842 9850 9876 9899 9905 10002 10025 10029 10057 10066 10077 10101 10111 10117 10133 10178 10180 10206 10215 10242 10249 10262 10266 10267 10271 10273 10286 10319 10365 10409 10432 10437 10509 10540 10560 10571 10579 10613 10621 10667 10671 10674 10700 10727 10757 10768 10796 10802 10834 10851 10872 10884 10891 10892 10915 10942 10979 11004 11055 11110 11133 11202 11225 11231 11254 11286 11304 11339 11356 11367 11404 11438 11458 11468 11494 11518 11562 11563 11596 11624 11647 11656 11702 11710 11763 11768 11789 11803 11849 11872 11882 11902 11906 11908 11922 11940 11947 11948 11967 11969 11974 11977 11988 12022 12027 12035 12036 12038 12053 12064 12066 12124 12163 12190 12195 12220 12238 12241 12250 12289 12308 12314 12323 12358 12362 12369 12372 12374 12399 12409 12422 12423 12425 12430 12462 12511 12562 12569 12577 12604 12612 12617 12677 12699 12708 12752 12774 12789 12802 12840 12857 12860 12868 12873 12876 12884 12893 12979 12994 13072 13074 13085 13095 13099 13124 13136 13139 13159 13184 13196 13206 13241 13290 13315 13323 13327 13331 13342 13380 
13406 13407 13415 13417 13418 13433 13440 13468 13473 13474 13491 13496 13521 13531 13578 13597 13660 13662 13664 13680 13769 13772 13780 13797 13826 13834 13838 13839 13854 13871 13873 13892 13898 13908 13936 13952 13958 13985 13988 13990 14019 14034 14048 14057 14074 14132 14135 14179 14203 14209 14240 14286 14316 14363 14373 14378 14386 14425 14431 14449 14459 14465 14490 14491 14505 14523 14548 14556 14558 14562 14574 14605 14606 14629 14653 14657 14665 14666 14670 14675 14681 14706 14722 14724 14727 14729 14738 14758 14759 14762 14780 14792 14816 14866 14884 14932 14942 14961 14981 15004 15008 15050 15075 15141 15171 15181 15182 15193 15201 15208 15221 15238 15249 15259 15260 15286 15287 15318 15325 15335 15337 15343 15367 15409 15414 15419 15428 15471 15480 15503 15521 15525 15530 15551 15558 15574 15593 15600 15638 15639 15655 15679 15680 15684 15711 15713 15733 15748 15749 15769 15782 15788 15797 15813 15823 15824 15833 15837 15840 15847 15852 15857 15862 15865 15877 15880 15910 15913 15917 15923 15925 15953 15984 15989 15993 16002 16007 16019 16033 16036 16118 16122 16209 16240 16252 16278 
## Number of NA values:   963 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in workclass variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in workclass variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in fnlwgt variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in fnlwgt variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in fnlwgt variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in education-num variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in education-num variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in education-num variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in marital-status variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in marital-status variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in marital-status variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in occupation variable> 
## 5 7 14 23 36 76 90 101 114 133 183 186 194 229 230 246 267 269 275 317 332 351 379 395 398 414 430 435 438 471 506 517 564 605 613 627 638 641 642 648 658 665 694 704 718 729 766 769 782 817 874 881 914 916 927 934 961 982 1001 1003 1006 1010 1019 1030 1044 1049 1064 1121 1128 1131 1143 1157 1168 1170 1178 1198 1206 1242 1252 1259 1260 1286 1307 1339 1363 1368 1378 1396 1418 1428 1439 1466 1481 1523 1525 1536 1561 1594 1596 1607 1608 1613 1626 1627 1642 1666 1682 1701 1734 1747 1751 1775 1779 1781 1788 1792 1802 1814 1829 1833 1835 1838 1864 1867 1894 1940 1945 1956 1983 2024 2043 2056 2089 2093 2118 2123 2161 2164 2202 2229 2256 2282 2324 2334 2365 2411 2416 2417 2439 2448 2493 2495 2499 2508 2511 2532 2537 2540 2548 2557 2580 2585 2594 2613 2635 2643 2651 2652 2656 2667 2722 2761 2775 2776 2779 2798 2805 2809 2849 2903 2921 2956 2966 2996 3005 3019 3025 3038 3068 3075 3084 3091 3103 3131 3144 3201 3207 3217 3220 3229 3233 3238 3260 3289 3298 3314 3408 3414 3422 3432 3480 3493 3541 3570 3639 3670 3672 3691 3699 3726 3745 3747 3758 3817 3854 3860 3868 3871 3883 3921 3964 3978 3983 3995 3998 4004 4006 4018 4044 4056 4059 4068 4082 4109 4118 4137 4149 4156 4177 4217 4222 4228 4231 4234 4239 4242 4260 4266 4271 4275 4282 4298 4305 4322 4331 4351 4373 4441 4456 4468 4484 4487 4520 4576 4586 4598 4611 4625 4641 4642 4647 4662 4663 4669 4709 4729 4745 4746 4749 4754 4760 4777 4778 4784 4789 4803 4822 4824 4841 4844 4847 4859 4862 4871 4886 4899 4928 4935 4936 4947 4960 4985 4990 4996 4999 5024 5047 5053 5067 5079 5088 5109 5110 5135 5146 5147 5165 5175 5182 5228 5246 5257 5290 5330 5349 5361 5373 5374 5380 5392 5401 5426 5435 5471 5474 5477 5487 5516 5519 5531 5533 5537 5574 5584 5588 5624 5636 5647 5650 5682 5683 5702 5715 5727 5733 5736 5747 5784 5817 5825 5839 5886 5889 5897 5901 5937 5953 5958 5973 5991 5992 6051 6065 6083 6096 6100 6131 6153 6187 6220 6222 6232 6233 6251 6266 6321 6326 6335 6366 6385 6386 6399 6433 6434 6492 6500 6509 6526 6612 6614 6615 6624 
6643 6649 6661 6674 6685 6752 6767 6784 6791 6801 6806 6840 6842 6870 6875 6885 6918 6929 6942 6993 7067 7072 7139 7142 7153 7170 7171 7224 7232 7252 7264 7267 7271 7304 7375 7426 7436 7445 7457 7481 7512 7530 7535 7540 7559 7562 7567 7611 7621 7624 7634 7641 7648 7724 7725 7735 7772 7776 7786 7794 7800 7812 7829 7872 7883 7893 7896 7901 7924 7925 7927 7948 7971 8019 8024 8028 8042 8080 8104 8105 8112 8116 8117 8123 8162 8184 8190 8204 8207 8224 8232 8235 8247 8249 8251 8259 8285 8290 8291 8338 8342 8347 8384 8585 8612 8627 8636 8649 8661 8668 8682 8695 8703 8711 8714 8722 8754 8761 8764 8786 8795 8804 8837 8838 8840 8852 8870 8876 8901 8917 8922 8954 8989 9031 9033 9055 9082 9084 9090 9092 9103 9129 9136 9138 9146 9176 9212 9252 9258 9275 9290 9313 9320 9355 9362 9368 9381 9382 9383 9418 9438 9494 9495 9502 9517 9525 9564 9568 9585 9586 9603 9608 9609 9624 9637 9649 9668 9709 9716 9724 9736 9751 9753 9768 9803 9808 9832 9842 9850 9876 9899 9905 10002 10025 10029 10057 10066 10077 10101 10111 10117 10133 10178 10180 10206 10215 10242 10249 10262 10266 10267 10271 10273 10286 10319 10365 10409 10432 10437 10509 10540 10560 10571 10579 10613 10621 10667 10671 10674 10700 10727 10757 10768 10796 10802 10834 10851 10872 10884 10891 10892 10915 10942 10979 11004 11055 11110 11133 11202 11225 11231 11254 11286 11304 11339 11356 11367 11404 11438 11458 11468 11494 11518 11562 11563 11596 11608 11624 11647 11656 11702 11710 11763 11768 11789 11803 11849 11872 11882 11902 11906 11908 11922 11940 11947 11948 11967 11969 11974 11977 11988 12022 12027 12035 12036 12038 12053 12064 12066 12124 12163 12190 12195 12220 12238 12241 12250 12289 12308 12314 12323 12358 12362 12369 12372 12374 12399 12409 12422 12423 12425 12430 12462 12511 12562 12569 12577 12604 12612 12617 12677 12699 12708 12752 12774 12789 12802 12840 12857 12860 12868 12873 12876 12884 12893 12979 12994 13072 13074 13085 13095 13099 13124 13136 13139 13159 13184 13196 13206 13241 13290 13315 13323 13327 13331 
13342 13380 13406 13407 13415 13417 13418 13433 13440 13468 13473 13474 13491 13496 13521 13531 13578 13597 13660 13662 13664 13680 13769 13772 13780 13797 13826 13834 13838 13839 13854 13871 13873 13892 13898 13899 13908 13936 13952 13958 13985 13988 13990 14019 14034 14048 14057 14074 14132 14135 14179 14203 14209 14240 14286 14316 14363 14373 14378 14386 14425 14431 14449 14459 14465 14490 14491 14505 14523 14548 14556 14558 14562 14574 14605 14606 14629 14653 14657 14665 14666 14670 14675 14681 14706 14722 14724 14727 14729 14738 14758 14759 14762 14780 14792 14816 14866 14884 14932 14942 14961 14981 15004 15008 15050 15075 15141 15171 15181 15182 15193 15201 15208 15221 15238 15249 15259 15260 15286 15287 15318 15325 15335 15337 15343 15367 15409 15414 15419 15428 15471 15480 15503 15521 15525 15530 15551 15558 15574 15593 15600 15638 15639 15655 15679 15680 15684 15711 15713 15733 15748 15749 15769 15782 15788 15797 15813 15823 15824 15833 15837 15840 15847 15852 15857 15862 15865 15877 15880 15910 15913 15917 15923 15925 15953 15984 15989 15993 16002 16007 16019 16033 16036 16118 16122 16209 16240 16252 16278 
## Number of NA values:   966 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in occupation variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in occupation variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in relationship variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in relationship variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in relationship variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in race variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in race variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in race variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in sex variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in sex variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in sex variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-gain variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-gain variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-gain variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in capital-loss variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in capital-loss variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in capital-loss variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in hours-per-week variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in hours-per-week variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in hours-per-week variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in native-country variable> 
## 20 66 84 189 254 306 330 404 421 472 516 649 666 688 844 1009 1039 1164 1334 1365 1406 1616 1644 1801 1822 1823 1832 1941 2061 2096 2107 2161 2227 2264 2305 2318 2324 2350 2477 2489 2552 2585 2613 2630 2697 2703 2775 2886 3061 3075 3122 3160 3222 3440 3460 3485 3508 3672 3678 3730 3762 3786 3854 3867 4187 4409 4540 4545 4608 4643 4649 4697 4728 4748 4764 4911 4923 5053 5126 5149 5152 5171 5181 5420 5469 5497 5648 5662 5717 5732 5829 5837 5944 5973 6034 6048 6054 6180 6206 6208 6234 6372 6403 6518 6587 6762 6776 6798 6801 6863 6871 6876 7017 7047 7060 7167 7206 7232 7288 7355 7443 7598 7601 7677 7708 7721 7750 7817 8029 8044 8078 8161 8183 8265 8369 8378 8433 8600 8622 8634 8700 8774 8849 8938 8976 9057 9145 9180 9200 9240 9244 9254 9263 9297 9335 9340 9354 9358 9415 9436 9497 9552 9567 9581 9626 9635 9699 9740 9874 9957 9983 10048 10151 10157 10202 10208 10267 10334 10346 10356 10364 10409 10475 10476 10509 10711 10739 10842 11130 11314 11348 11390 11407 11610 11686 11733 11749 11762 11784 11889 11946 12371 12386 12398 12415 12436 12456 12506 12577 12579 12607 12626 12648 12725 12780 12797 12911 12990 13171 13241 13254 13293 13311 13362 13547 13550 13575 13614 13693 13721 13746 13760 13764 13792 13926 13931 13934 13971 13980 14005 14029 14030 14072 14189 14203 14225 14263 14334 14373 14407 14446 14547 14585 14611 14652 14732 15006 15015 15084 15091 15099 15185 15234 15321 15350 15397 15421 15481 15594 15685 15712 16044 16091 16266 
## Number of NA values:   274 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in native-country variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in native-country variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of NA rows in income variable> 
##  
## Number of NA values:   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain missing values in income variable> 
##  
## Number of Missing values :   0 
## [1] "======================================"
## [1] "======================================"
## <names of rows contain ? values in income variable> 
##  
## Number of ? values :   0 
## [1] "======================================"
## [1] "======================================"
# Percentage of missing (NA) values in each column of the training set.
# is.na() on a data frame gives a logical matrix, so colMeans() yields the NA
# fraction per column without apply() first coercing the whole data frame to a
# character matrix.
colMeans(is.na(train)) * 100
##            age      workclass         fnlwgt      education  education-num 
##       0.000000       5.638647       0.000000       0.000000       0.000000 
## marital-status     occupation   relationship           race            sex 
##       0.000000       5.660146       0.000000       0.000000       0.000000 
##   capital-gain   capital-loss hours-per-week native-country         income 
##       0.000000       0.000000       0.000000       1.790486       0.000000
# Percentage of missing (NA) values in each column of the testing set.
# colMeans() over the logical is.na() matrix avoids apply() coercing the data
# frame to a character matrix.
colMeans(is.na(test)) * 100
##            age      workclass         fnlwgt      education  education-num 
##       0.000000       5.914870       0.000000       0.000000       0.000000 
## marital-status     occupation   relationship           race            sex 
##       0.000000       5.933296       0.000000       0.000000       0.000000 
##   capital-gain   capital-loss hours-per-week native-country         income 
##       0.000000       0.000000       0.000000       1.682943       0.000000
# Use md.pattern() from the mice package to tabulate the missing-data patterns
# of the training set (which variables are missing together, and how often).
md.pattern(train)
##       age fnlwgt education education-num marital-status relationship race
## 30162   1      1         1             1              1            1    1
##     7   1      1         1             1              1            1    1
##   556   1      1         1             1              1            1    1
##  1809   1      1         1             1              1            1    1
##    27   1      1         1             1              1            1    1
##         0      0         0             0              0            0    0
##       sex capital-gain capital-loss hours-per-week income native-country
## 30162   1            1            1              1      1              1
##     7   1            1            1              1      1              1
##   556   1            1            1              1      1              0
##  1809   1            1            1              1      1              1
##    27   1            1            1              1      1              0
##         0            0            0              0      0            583
##       workclass occupation     
## 30162         1          1    0
##     7         1          0    1
##   556         1          1    1
##  1809         0          0    2
##    27         0          0    3
##            1836       1843 4262
# Plot the proportion of missing values per variable and the combinations in
# which they occur for the training set; variables are sorted by missingness.
# NOTE(review): the result is stored in a variable named `plot`, the same name
# as the base plotting function.
plot <- aggr(
  train,
  col = c("blue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(train),
  cex.axis = 0.7,
  gap = 2,
  ylab = c("Missing data", "Pattern")
)
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##        Variable      Count
##      occupation 0.05660146
##       workclass 0.05638647
##  native-country 0.01790486
##             age 0.00000000
##          fnlwgt 0.00000000
##       education 0.00000000
##   education-num 0.00000000
##  marital-status 0.00000000
##    relationship 0.00000000
##            race 0.00000000
##             sex 0.00000000
##    capital-gain 0.00000000
##    capital-loss 0.00000000
##  hours-per-week 0.00000000
##          income 0.00000000
# Tabulate the missing-data patterns of the testing set with md.pattern().
md.pattern(test)
##       age fnlwgt education education-num marital-status relationship race
## 15060   1      1         1             1              1            1    1
##     3   1      1         1             1              1            1    1
##   255   1      1         1             1              1            1    1
##   944   1      1         1             1              1            1    1
##    19   1      1         1             1              1            1    1
##         0      0         0             0              0            0    0
##       sex capital-gain capital-loss hours-per-week income native-country
## 15060   1            1            1              1      1              1
##     3   1            1            1              1      1              1
##   255   1            1            1              1      1              0
##   944   1            1            1              1      1              1
##    19   1            1            1              1      1              0
##         0            0            0              0      0            274
##       workclass occupation     
## 15060         1          1    0
##     3         1          0    1
##   255         1          1    1
##   944         0          0    2
##    19         0          0    3
##             963        966 2203
# Plot the proportion of missing values per variable and the combinations in
# which they occur for the testing set; variables are sorted by missingness.
# NOTE(review): this overwrites whatever was previously stored in `plot`.
plot <- aggr(
  test,
  col = c("blue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(test),
  cex.axis = 0.7,
  gap = 2,
  ylab = c("Missing data", "Pattern")
)
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##        Variable      Count
##      occupation 0.05933296
##       workclass 0.05914870
##  native-country 0.01682943
##             age 0.00000000
##          fnlwgt 0.00000000
##       education 0.00000000
##   education-num 0.00000000
##  marital-status 0.00000000
##    relationship 0.00000000
##            race 0.00000000
##             sex 0.00000000
##    capital-gain 0.00000000
##    capital-loss 0.00000000
##  hours-per-week 0.00000000
##          income 0.00000000
# Hmisc package to impute missing values
# ww <- aregImpute(~ age + workclass + fnlwgt + education + `education-num` + `marital-status` +
#                    occupation + relationship + race + sex + `capital-gain` + `capital-loss` +
#                    `hours-per-week` + income,
#                  data = train, n.impute = 5, group = "income")



#mlr package to impute missing values
# newworkclass <- impute(train[,2], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")
# 
# newoccupation <- impute(train[,7], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")
# 
# newcountry <- impute(train[,14], classes = list(factor = imputeMode(), integer = imputeMean()), dummy.classes = c("integer","factor"), dummy.type = "numeric")



# missForest package to impute missing values.
# The imputation run is kept commented out below; the active code reads the
# imputed training set back from the CSV path that the commented-out
# write.csv() call targets.
# foresting <- missForest(train, maxiter = 5, ntree = 100)
# foresting$OOBerror
# newtrain <- foresting$ximp
# write.csv(newtrain, file = "../data/cleandata/newtrain.csv", col.names = T, row.names = F)
# read.csv() uses check.names = TRUE by default, so hyphenated names such as
# "education-num" come back as "education.num".
newtrain <- read.csv("../data/cleandata/newtrain.csv", header = TRUE)
dim(newtrain)
## [1] 32561    15
# Same procedure for the testing set: the missForest run is kept commented out
# and the imputed data are read back from the CSV path used by the
# commented-out write.csv() call.
# foresting2 <- missForest(test, maxiter = 5, ntree = 100)
# foresting2$OOBerror
# newtest <- foresting2$ximp
# write.csv(newtest, file = "../data/cleandata/newtest.csv", col.names = T, row.names = F)
newtest <- read.csv("../data/cleandata/newtest.csv", header = TRUE)
dim(newtest)
## [1] 16281    15
# Check that imputation changed nothing except the missing values.
# Twenty randomly chosen rows of the imputed data are compared cell by cell
# with the same rows of the raw data:
#   1  = values are identical,
#   0  = an observed value was altered (should never appear),
#   NA = the raw value was missing, so the comparison is undefined.
n_checks <- 20
checks <- vector("list", n_checks)
for (i in seq_len(n_checks)) {
  a <- sample.int(nrow(newtrain), 1)
  checks[[i]] <- newtrain[a, ] == train[a, ]
}
# Bind the per-row comparisons once instead of growing a matrix with rbind()
# inside the loop; adding 0 converts TRUE/FALSE to the numeric 1/0 coding.
t <- do.call(rbind, checks) + 0
t
##       age workclass fnlwgt education education.num marital.status
## 3051    1         1      1         1             1              1
## 31315   1         1      1         1             1              1
## 6084    1         1      1         1             1              1
## 29934   1         1      1         1             1              1
## 27188   1         1      1         1             1              1
## 25542   1         1      1         1             1              1
## 18652   1         1      1         1             1              1
## 3224    1         1      1         1             1              1
## 32399   1         1      1         1             1              1
## 9143    1         1      1         1             1              1
## 29546   1         1      1         1             1              1
## 26228   1         1      1         1             1              1
## 6168    1         1      1         1             1              1
## 30951   1         1      1         1             1              1
## 17038   1         1      1         1             1              1
## 26789   1         1      1         1             1              1
## 17202   1         1      1         1             1              1
## 29103   1         1      1         1             1              1
## 20814   1         1      1         1             1              1
## 7638    1         1      1         1             1              1
##       occupation relationship race sex capital.gain capital.loss
## 3051           1            1    1   1            1            1
## 31315          1            1    1   1            1            1
## 6084           1            1    1   1            1            1
## 29934          1            1    1   1            1            1
## 27188          1            1    1   1            1            1
## 25542          1            1    1   1            1            1
## 18652          1            1    1   1            1            1
## 3224           1            1    1   1            1            1
## 32399          1            1    1   1            1            1
## 9143           1            1    1   1            1            1
## 29546          1            1    1   1            1            1
## 26228          1            1    1   1            1            1
## 6168           1            1    1   1            1            1
## 30951          1            1    1   1            1            1
## 17038          1            1    1   1            1            1
## 26789          1            1    1   1            1            1
## 17202          1            1    1   1            1            1
## 29103          1            1    1   1            1            1
## 20814          1            1    1   1            1            1
## 7638           1            1    1   1            1            1
##       hours.per.week native.country income
## 3051               1              1      1
## 31315              1              1      1
## 6084               1              1      1
## 29934              1              1      1
## 27188              1              1      1
## 25542              1              1      1
## 18652              1              1      1
## 3224               1              1      1
## 32399              1              1      1
## 9143               1              1      1
## 29546              1              1      1
## 26228              1              1      1
## 6168               1              1      1
## 30951              1              1      1
## 17038              1              1      1
## 26789              1              1      1
## 17202              1             NA      1
## 29103              1              1      1
## 20814              1              1      1
## 7638               1              1      1
# Same sanity check for the test set: compare 20 randomly drawn rows of the
# imputed data with the raw data. 1 = unchanged, NA = raw value was missing,
# 0 = an observed value was altered (should never appear).
cmp_rows2 <- vector("list", 20)
for (i in seq_along(cmp_rows2)) {
  a <- sample.int(nrow(newtest), 1)
  # One-row logical matrix; its row name is the sampled row label.
  cmp_rows2[[i]] <- newtest[a, ] == test[a, ]
}
# Stack the 20 one-row matrices and turn TRUE/FALSE into 1/0.
t2 <- do.call(rbind, cmp_rows2) * 1
t2
##       age workclass fnlwgt education education.num marital.status
## 2501    1         1      1         1             1              1
## 1628    1         1      1         1             1              1
## 7916    1         1      1         1             1              1
## 2844    1         1      1         1             1              1
## 3819    1         1      1         1             1              1
## 5284    1         1      1         1             1              1
## 3915    1         1      1         1             1              1
## 12315   1         1      1         1             1              1
## 2626    1         1      1         1             1              1
## 6014    1         1      1         1             1              1
## 5583    1         1      1         1             1              1
## 11575   1         1      1         1             1              1
## 14876   1         1      1         1             1              1
## 12425   1        NA      1         1             1              1
## 8425    1         1      1         1             1              1
## 5838    1         1      1         1             1              1
## 511     1         1      1         1             1              1
## 1575    1         1      1         1             1              1
## 9847    1         1      1         1             1              1
## 2292    1         1      1         1             1              1
##       occupation relationship race sex capital.gain capital.loss
## 2501           1            1    1   1            1            1
## 1628           1            1    1   1            1            1
## 7916           1            1    1   1            1            1
## 2844           1            1    1   1            1            1
## 3819           1            1    1   1            1            1
## 5284           1            1    1   1            1            1
## 3915           1            1    1   1            1            1
## 12315          1            1    1   1            1            1
## 2626           1            1    1   1            1            1
## 6014           1            1    1   1            1            1
## 5583           1            1    1   1            1            1
## 11575          1            1    1   1            1            1
## 14876          1            1    1   1            1            1
## 12425         NA            1    1   1            1            1
## 8425           1            1    1   1            1            1
## 5838           1            1    1   1            1            1
## 511            1            1    1   1            1            1
## 1575           1            1    1   1            1            1
## 9847           1            1    1   1            1            1
## 2292           1            1    1   1            1            1
##       hours.per.week native.country income
## 2501               1              1      1
## 1628               1              1      1
## 7916               1              1      1
## 2844               1              1      1
## 3819               1              1      1
## 5284               1              1      1
## 3915               1              1      1
## 12315              1              1      1
## 2626               1              1      1
## 6014               1              1      1
## 5583               1              1      1
## 11575              1              1      1
## 14876              1              1      1
## 12425              1              1      1
## 8425               1              1      1
## 5838               1              1      1
## 511                1              1      1
## 1575               1              1      1
## 9847               1              1      1
## 2292               1              1      1

\(\\\)

\(\\\)

b) 2 - 5 EDAs

# Inspect the structure and summaries of both imputed sets before removing
# outliers.
# Test set: column types and factor levels.
str(newtest)
## 'data.frame':    16281 obs. of  15 variables:
##  $ age           : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 4 4 2 4 4 4 4 6 4 4 ...
##  $ fnlwgt        : int  226802 89814 336951 160323 103497 198693 227026 104626 369667 104996 ...
##  $ education     : Factor w/ 16 levels "10th","11th",..: 2 12 8 16 16 1 12 15 16 6 ...
##  $ education.num : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 7 5 11 7 12 8 6 10 8 3 ...
##  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ capital.gain  : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native.country: Factor w/ 40 levels "Cambodia","Canada",..: 38 38 38 38 38 38 38 38 38 38 ...
##  $ income        : Factor w/ 2 levels "<=50K.",">50K.": 1 1 2 2 1 1 1 2 1 1 ...
# Test set: per-column summary statistics (quantiles for numeric columns,
# level counts for factors).
summary(newtest)
##       age                   workclass         fnlwgt       
##  Min.   :17.00   Private         :11963   Min.   :  13492  
##  1st Qu.:28.00   Self-emp-not-inc: 1433   1st Qu.: 116736  
##  Median :37.00   Local-gov       : 1090   Median : 177831  
##  Mean   :38.77   State-gov       :  710   Mean   : 189436  
##  3rd Qu.:48.00   Self-emp-inc    :  594   3rd Qu.: 238384  
##  Max.   :90.00   Federal-gov     :  481   Max.   :1490400  
##                  (Other)         :   10                    
##         education    education.num                 marital.status
##  HS-grad     :5283   Min.   : 1.00   Divorced             :2190  
##  Some-college:3587   1st Qu.: 9.00   Married-AF-spouse    :  14  
##  Bachelors   :2670   Median :10.00   Married-civ-spouse   :7403  
##  Masters     : 934   Mean   :10.07   Married-spouse-absent: 210  
##  Assoc-voc   : 679   3rd Qu.:12.00   Never-married        :5434  
##  11th        : 637   Max.   :16.00   Separated            : 505  
##  (Other)     :2491                   Widowed              : 525  
##            occupation           relationship                  race      
##  Prof-specialty :2111   Husband       :6523   Amer-Indian-Eskimo:  159  
##  Craft-repair   :2040   Not-in-family :4278   Asian-Pac-Islander:  480  
##  Exec-managerial:2035   Other-relative: 525   Black             : 1561  
##  Adm-clerical   :1967   Own-child     :2513   Other             :  135  
##  Sales          :1921   Unmarried     :1679   White             :13946  
##  Other-service  :1825   Wife          : 763                             
##  (Other)        :4382                                                   
##      sex         capital.gain    capital.loss    hours.per.week 
##  Female: 5421   Min.   :    0   Min.   :   0.0   Min.   : 1.00  
##  Male  :10860   1st Qu.:    0   1st Qu.:   0.0   1st Qu.:40.00  
##                 Median :    0   Median :   0.0   Median :40.00  
##                 Mean   : 1082   Mean   :  87.9   Mean   :40.39  
##                 3rd Qu.:    0   3rd Qu.:   0.0   3rd Qu.:45.00  
##                 Max.   :99999   Max.   :3770.0   Max.   :99.00  
##                                                                 
##        native.country     income     
##  United-States:14892   <=50K.:12435  
##  Mexico       :  311   >50K. : 3846  
##  Philippines  :  111                 
##  Puerto-Rico  :   70                 
##  Germany      :   69                 
##  Canada       :   61                 
##  (Other)      :  767
# Training set: column types and factor levels.
# NOTE(review): in the printed structures the test-set income levels carry a
# trailing period ("<=50K.", ">50K.") while the training levels do not, and
# native.country has 40 levels in test vs 41 in train. Presumably these need
# harmonising before modelling -- confirm.
str(newtrain)
## 'data.frame':    32561 obs. of  15 variables:
##  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
##  $ fnlwgt        : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
##  $ education     : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
##  $ education.num : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
##  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capital.gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
##  $ income        : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
# Training set: per-column summary statistics (quantiles for numeric columns,
# level counts for factors).
summary(newtrain)
##       age                   workclass         fnlwgt       
##  Min.   :17.00   Private         :24068   Min.   :  12285  
##  1st Qu.:28.00   Self-emp-not-inc: 2776   1st Qu.: 117827  
##  Median :37.00   Local-gov       : 2193   Median : 178356  
##  Mean   :38.58   State-gov       : 1352   Mean   : 189778  
##  3rd Qu.:48.00   Self-emp-inc    : 1164   3rd Qu.: 237051  
##  Max.   :90.00   Federal-gov     :  985   Max.   :1484705  
##                  (Other)         :   23                    
##         education     education.num                 marital.status 
##  HS-grad     :10501   Min.   : 1.00   Divorced             : 4443  
##  Some-college: 7291   1st Qu.: 9.00   Married-AF-spouse    :   23  
##  Bachelors   : 5355   Median :10.00   Married-civ-spouse   :14976  
##  Masters     : 1723   Mean   :10.08   Married-spouse-absent:  418  
##  Assoc-voc   : 1382   3rd Qu.:12.00   Never-married        :10683  
##  11th        : 1175   Max.   :16.00   Separated            : 1025  
##  (Other)     : 5134                   Widowed              :  993  
##            occupation           relationship                   race      
##  Prof-specialty :4295   Husband       :13193   Amer-Indian-Eskimo:  311  
##  Craft-repair   :4162   Not-in-family : 8305   Asian-Pac-Islander: 1039  
##  Exec-managerial:4129   Other-relative:  981   Black             : 3124  
##  Adm-clerical   :3992   Own-child     : 5068   Other             :  271  
##  Sales          :3715   Unmarried     : 3446   White             :27816  
##  Other-service  :3696   Wife          : 1568                             
##  (Other)        :8572                                                    
##      sex         capital.gain    capital.loss    hours.per.week 
##  Female:10771   Min.   :    0   Min.   :   0.0   Min.   : 1.00  
##  Male  :21790   1st Qu.:    0   1st Qu.:   0.0   1st Qu.:40.00  
##                 Median :    0   Median :   0.0   Median :40.00  
##                 Mean   : 1078   Mean   :  87.3   Mean   :40.44  
##                 3rd Qu.:    0   3rd Qu.:   0.0   3rd Qu.:45.00  
##                 Max.   :99999   Max.   :4356.0   Max.   :99.00  
##                                                                 
##        native.country    income     
##  United-States:29675   <=50K:24720  
##  Mexico       :  657   >50K : 7841  
##  Philippines  :  211                
##  Germany      :  137                
##  Canada       :  121                
##  Puerto-Rico  :  114                
##  (Other)      : 1646
# Outlier inspection for the training set.
# Positions of the continuous columns: age, fnlwgt, education.num,
# capital.gain, capital.loss and hours.per.week.
continuouscol <- c(1, 3, 5, 11, 12, 13)

# Lay the plots out on a 2 x 3 grid, one panel per continuous column.
par(mfrow = c(2, 3))

# Box plots, titled and labelled with the column name.
for (i in continuouscol) {
  var_name <- names(newtrain)[i]
  boxplot(newtrain[[i]], main = paste("boxplot for", var_name),
          xlab = var_name)
}

# Kernel density estimates, filled red with a blue outline.
for (i in continuouscol) {
  den_acc <- density(newtrain[[i]], adjust = 1)
  plot(den_acc, main = paste("density plot for", names(newtrain)[i]))
  polygon(den_acc, col = "red", border = "blue")
}

# For each continuous column store list(<outlier values>, <outlier count>),
# where outliers are the points boxplot.stats() reports in $out.
# NOTE: entries are stored at the column's position, so the list has NULL
# holes at the other positions and head() only shows slots 1 to 6.
outlierstrain <- list()
for (i in continuouscol) {
  outliers <- boxplot.stats(newtrain[[i]])$out
  numbers <- length(outliers)
  outlierstrain[[i]] <- list(outliers, numbers)
}
head(outlierstrain)
## [[1]]
## [[1]][[1]]
##   [1] 79 90 80 81 90 88 90 90 80 90 81 82 79 81 80 83 90 90 79 81 90 90 80
##  [24] 90 90 79 79 84 90 80 90 81 83 84 81 79 85 82 79 80 90 90 90 84 80 90
##  [47] 90 79 84 90 79 90 90 90 82 81 90 84 79 81 82 81 80 90 80 84 82 79 90
##  [70] 84 90 83 79 81 80 79 80 79 80 90 90 80 90 90 81 83 82 90 90 81 80 80
##  [93] 90 79 80 82 85 80 79 90 81 79 80 79 81 82 88 90 82 88 84 83 79 86 90
## [116] 90 82 83 81 79 90 80 81 79 84 84 79 90 80 81 81 81 90 87 90 80 80 82
## [139] 90 90 85 82 81
## 
## [[1]][[2]]
## [1] 143
## 
## 
## [[2]]
## NULL
## 
## [[3]]
## [[3]][[1]]
##   [1]  544091  507875  446839  432376  494223  428030  483777  633742
##   [9]  523910  635913  538583  477983  425161  860348  423158  481060
##  [17]  416103  445382 1033222  426017  543162  433665  462440  556660
##  [25]  430828  475028  420537  680390  499233  543028  465507  526968
##  [33]  767403  431192  520586  445824  416745  444304  441454  421132
##  [41]  795830  419721  509350  467108  444554  449257  441620  563883
##  [49]  431745  436006  473040  910398  451940  428350  421871  443040
##  [57]  420895  496743  429507  418324  538319  508336  445382  483201
##  [65]  452205  672412  473547  421065  505119  460046  549430  441591
##  [73]  438696  488720  482082  460835  519627  675421  481987  758700
##  [81]  509364  432565  490332  466224  446219  423460  509364  656036
##  [89]  443508  566117  436253  454508  427686  548510  545483  503012
##  [97]  573583  511361  454941  452405  716416  480861  498785  637222
## [105]  430084  423770  417657  446358  457402  664821  462890  598606
## [113]  457237  465326  503923  572751  580248  519006  617021  437994
## [121]  596776  588905  517995  640383  504725  423863  420917  470663
## [129]  611029  437851  495888  549341  421837  746786  550848  510072
## [137]  449432  430471  416129  511331  446559  452640  456399  469705
## [145]  656036  488720  434710  449354  425627  417136  460835  416338
## [153]  424079  423561  688355  587310  628797  421449  424988  443508
## [161]  632613  499249  445758  416164  473133  450580  506329  445168
## [169]  516337  432376  571853 1184622  913447  476573  632593  595000
## [177]  703067  484475  476391  749105  459465  543922  420282  498325
## [185]  447579  420749  482732  437281  427965  505980  549349  496025
## [193]  562558  642830  435022  443546  523095  436770  436493  704108
## [201]  557082  477106  471452  426001  464536  451996  505980  454614
## [209]  473748  506858  434102  454989  537222  595000  454508  577521
## [217]  424012  431426  604506  564135  427781  469907  503675  444089
## [225]  435835  512103  716066  487486  484298  479765  444743  483596
## [233]  525878  423250  538443  493034  434292  496382  432154  528616
## [241]  515025  433491  421223  428350  446358  455995  659273  435604
## [249]  425092  452924  541737  444822  423024  445940  468706  428584
## [257]  972354  459189  498216  608184  444219  433788  586657 1226583
## [265]  664670  447346  504725  427055  561334  499001  791084  917220
## [273]  430084  508548  511289  416577  512992  431745  427862  637080
## [281]  431861  671292  442612  494638  431307  459007  517000  421446
## [289]  548361  648223  522881  433669  461678  416059  473836  745768
## [297]  523067  508891  486332  418176  417419  464945  454508  476653
## [305]  488706  647882  569761  585203  539563 1038553  567788  732569
## [313]  416165  721161  509629  474136  450924  477697  423711  419658
## [321]  553473  496414  421967  453067  466458  421561  483530  560804
## [329]  447079  528616  485496  425528  502316  467799  469921  444134
## [337]  443179  497300  426431  607848  501172  441700  483822  420973
## [345]  514033  470663  472604  487411  558183  416829  430005  426263
## [353]  439608  456236  420779  541282  518030  459248  548580  526528
## [361]  447739  586657  433375  581071  437727  575442  554986  592930
## [369]  632834  423052  504951  484861  449576  496538  459463  505438
## [377]  479482  467108  467108  849857  426562  558944  420054  691903
## [385]  419691  684015  423605  461678  466498  530099  554317  420054
## [393]  450920  427952  695136  698418  464103  526968  450695  548303
## [401]  529216  526164  506436  439919  734193  737315  544686  468713
## [409]  548361  556652  691830  520775  442429  433669  607799  660870
## [417]  440456  471990  483822  423222  500509  487742  498785  423064
## [425]  532379  426895  493862  424855  469602  432555  424468  428271
## [433]  464502  446140  480717  529104  456110  451744  680390  438711
## [441]  483450  419053  857532  454063 1484705  424034  421837  425447
## [449]  456956  434467  755858  523484  436861  654141  469864  424034
## [457]  458549  930948  664366  420629  456236  515629  606111  463667
## [465]  431637  509364  634226  458558  483261  420749  446358  428405
## [473]  451996  423297  568490  447882  450246  456236  448626 1268339
## [481]  467579  455995  698363  617860  615893  427382  565313  591711
## [489]  520231  461337  419554  460408  454915  448337  536725  472070
## [497]  430175  446771  485117  500002  462294  443508  418020  435638
## [505]  420277  511517  438139  462255 1366120  495061  420351  431245
## [513]  434894  441210  419394  593246  449432  473133  440138  462838
## [521]  423222  529223  456618  651396  451951  431861  517036  436361
## [529]  497788  529216  441637  526734  543042  428299  427744  501144
## [537]  417668  631947  489085  436798  443855  438427  437890  540712
## [545]  549174  460437  806552  604537  487085  436341  473748  484024
## [553] 1455435  445382  659504  416745  439263  556688  750972  424884
## [561]  607848  454915  419895  548256  493363  463194  450695  422149
## [569]  552354  469056  435503  561489  455361  578377  509500  889965
## [577]  462180  506329  428499  507086  419732  659558  440129  609935
## [585]  521400  608184  425804  415913  513660  424478  422960  445728
## [593]  467108  615367  557236  562336  427474  493443  443546  430554
## [601]  434097  520078  460408  454934  474617  485117  456618  660461
## [609]  423222  442035  533147  497253  617898  449354  419722  440607
## [617]  442045  450544  953588  425622  609789  598995  421633  609789
## [625]  424719  482732  469697  452283  663394  417668  530454  494784
## [633]  436107  543477  452452  481096  420054  495982  556902  421412
## [641]  432052  418405  732102  548256  476334  709445  463072  469454
## [649]  423616  456604  609789  570821  438176  416356  421561  636017
## [657]  703107  544792  434463  434114  423222  418961  595088  438996
## [665]  607848  433705  462832  476334  527162  470875  416415  456572
## [673]  422836  566049  602513  509060  448026  491000  488541  520033
## [681]  554206  429346  455379  443742  520759  421837  694812  578701
## [689]  422013  462869  456618  549413  598802  511289  464103  462294
## [697]  427422  440417  439919  424494  806316  459548  541343  438839
## [705]  439592 1033222  424468  599629  571017  416577  425199  738812
## [713]  497280  447066  477209  431513  618191  544268  557853  535978
## [721]  668319  423024  491421  682947  469572  574271  456460  478829
## [729]  816750  597843  442274  595461  553405  506329  704108  481987
## [737]  460408  515712  551962  572751  745817  422933  473171  481175
## [745]  433170  476558  420986  447488  446512  497486  433330  496856
## [753] 1161363  435836  424591  425049  441542  419691  433330  444607
## [761]  459342  452808  427474  447555  422718  673764  424494  418405
## [769]  446654  434467  479621  472789  454843  456062  588484  809585
## [777]  493689  445382  482927  503454  574271  462820  478994  434268
## [785]  501671  594187  439779  509462  435469  548664  422813  498079
## [793]  431515  447488  466502  558490  456661  509048  419146  468713
## [801]  653574  706026  511068  427965  452640  475324  470203  513416
## [809]  421561  417941  535978  422249  442274  721712  615367  472580
## [817]  549174  437825 1097453  423222  461715  471452  426836  442131
## [825]  477867  461929  478380  479611  419146  472807  515797  475322
## [833]  510072  570562  491000  419134  423024  473133 1085515  500720
## [841]  421633  511668  455361  521665  478457  548361  591711  518530
## [849]  594187  417668  452406  499197  434430  509866  504871  695411
## [857]  420986  442359  462966  761006  484669  423616  467611  440647
## [865]  506830  574005  478205  604045  465974  415913  605502  589809
## [873]  426467  487347  588003  509629  431426  429897  709798  561334
## [881]  481987  570002  443546 1125613  454915  440706  532845  498328
## [889]  604380  583755  437909  420691  510072  557349  501172  609789
## [897]  476599  424094  557644  706180  425785  606752  417668  673764
## [905]  460214  475324  547886  554206  430035  456236  419740  462832
## [913]  440129  584790  425804  481987  799281  657397  496526  426431
## [921]  440969  487330  444554  512771  466325  440969  512828  422275
## [929]  531055  437666  472166  653574  417605  502837  444304  436798
## [937]  745768  478346  857532  715938  747719  569930  423217  433989
## [945]  475322  585361  452402  425497  502752  492263  543922  766115
## [953]  461337  421561  456922  584259  493034  538822  542265  430283
## [961]  498349  431245  491862  420895  448337  418702  477505  421467
## [969]  469454  749636  433906  437727  668362  449101  981628  470368
## [977]  746432  451059  499935  473625  566537  456367  455553  693066
## [985]  539864  447346  478315  427686  435842  485710  436163  514716
## 
## [[3]][[2]]
## [1] 992
## 
## 
## [[4]]
## NULL
## 
## [[5]]
## [[5]][[1]]
##    [1] 4 3 4 4 2 4 3 4 2 1 4 4 3 3 3 4 2 2 2 3 3 2 4 4 4 3 4 4 3 3 4 3 2 1
##   [35] 4 4 4 4 2 2 3 3 4 3 4 3 4 4 3 2 4 4 4 4 3 4 4 4 4 4 4 2 4 4 4 4 3 3
##   [69] 4 3 4 4 4 4 4 4 4 4 3 4 3 4 4 2 2 3 3 4 3 2 4 4 4 3 3 2 2 4 3 4 1 4
##  [103] 1 4 4 4 3 3 4 3 4 4 4 2 4 3 4 3 3 3 1 4 4 4 4 4 1 4 4 4 3 3 4 4 4 4
##  [137] 4 3 4 4 3 2 4 4 4 1 3 4 4 4 4 2 2 4 4 4 2 4 4 3 4 4 4 4 2 4 4 4 3 4
##  [171] 3 3 3 4 2 4 4 2 4 4 4 3 4 4 4 3 4 3 4 3 4 3 4 2 3 3 4 4 3 3 4 2 4 3
##  [205] 2 2 4 4 2 2 4 4 2 2 3 3 3 4 3 4 4 4 4 4 1 4 3 4 4 4 4 3 4 4 4 1 4 4
##  [239] 4 4 4 4 4 4 1 3 4 1 4 4 2 4 2 4 4 4 3 3 3 4 4 4 4 3 2 2 4 4 3 4 4 2
##  [273] 4 1 4 4 4 4 4 4 4 4 3 1 1 1 4 4 4 2 4 3 3 3 4 2 4 4 4 3 2 4 4 4 2 4
##  [307] 1 4 4 4 4 3 2 2 4 4 4 3 3 3 2 2 4 3 4 3 4 4 4 4 3 4 3 4 4 3 4 4 4 3
##  [341] 4 4 3 3 4 3 4 2 3 2 4 3 2 3 4 4 4 2 4 4 4 4 3 3 4 4 2 4 3 1 3 2 4 3
##  [375] 3 4 3 3 4 4 2 4 3 2 3 4 3 4 4 3 3 2 4 4 4 3 4 3 4 1 4 4 2 2 4 3 1 4
##  [409] 3 3 4 3 4 4 4 3 3 3 4 3 1 4 2 2 4 3 3 3 2 4 4 4 3 4 4 2 3 4 4 3 3 4
##  [443] 3 4 4 4 4 4 4 4 3 2 4 3 4 4 3 2 4 2 4 4 4 3 4 3 4 4 4 2 4 4 3 3 4 3
##  [477] 1 3 2 3 2 4 4 4 3 4 2 2 4 2 2 3 4 2 3 4 3 3 4 4 4 3 2 3 3 3 4 4 4 4
##  [511] 2 3 4 3 2 3 3 3 4 3 4 3 4 4 4 3 4 3 2 4 4 3 3 4 3 4 3 4 3 3 3 2 3 3
##  [545] 4 4 1 4 3 4 3 2 4 2 4 3 3 4 3 3 4 2 4 4 4 2 4 4 4 4 4 4 4 4 4 3 2 4
##  [579] 2 4 4 3 4 4 4 4 4 3 3 4 2 4 4 3 1 3 4 4 1 3 4 4 4 4 3 4 2 4 4 4 4 2
##  [613] 4 3 4 4 4 4 3 4 4 3 2 3 4 2 4 4 4 3 4 3 4 4 4 4 3 4 3 3 4 2 2 3 4 4
##  [647] 3 4 4 3 4 3 3 4 4 4 4 4 4 3 3 4 3 2 1 4 4 3 4 3 4 3 3 4 3 4 2 2 4 4
##  [681] 2 4 3 2 4 3 4 2 4 3 2 4 3 4 2 2 3 2 3 4 4 4 4 4 4 4 4 3 4 4 3 4 2 4
##  [715] 4 4 4 4 4 4 2 4 4 4 4 3 4 3 4 3 1 4 4 3 2 4 3 3 4 4 3 3 4 4 4 3 2 4
##  [749] 4 2 3 4 4 4 4 4 3 4 4 3 4 1 4 1 4 4 4 2 4 3 4 4 2 4 1 3 3 3 4 1 3 4
##  [783] 4 3 2 4 2 4 4 3 4 3 4 4 1 4 2 3 3 3 2 4 3 4 4 4 4 2 1 2 4 3 4 4 4 3
##  [817] 4 3 3 1 4 3 3 2 4 3 3 2 4 3 4 3 4 4 4 4 3 4 4 4 4 4 4 3 2 4 2 3 3 3
##  [851] 4 4 4 4 3 3 4 4 4 3 3 2 4 4 4 4 1 4 2 4 4 4 4 3 4 4 4 2 4 4 4 4 1 4
##  [885] 1 4 4 4 4 4 2 4 1 4 1 4 4 4 4 3 4 1 4 4 4 4 3 4 3 3 3 4 3 3 2 3 4 4
##  [919] 4 1 4 2 4 4 4 4 3 4 3 4 4 3 1 4 4 4 3 4 2 4 4 3 4 3 4 4 3 2 4 4 4 1
##  [953] 4 4 1 4 4 4 4 4 3 2 3 4 3 3 2 3 3 4 4 4 2 4 4 2 4 3 1 4 4 2 4 1 4 4
##  [987] 3 3 3 3 3 4 3 4 3 3 2 4 3 4 4 4 4 4 4 3 4 3 3 4 3 4 3 2 4 4 4 3 4 3
## [1021] 4 3 2 2 4 2 4 4 4 4 2 4 2 3 3 2 3 4 1 4 3 3 3 4 3 4 2 4 4 3 3 4 2 3
## [1055] 3 4 3 4 3 3 4 2 3 4 4 3 4 3 4 4 4 4 4 4 4 3 4 4 4 4 3 3 4 2 3 4 3 3
## [1089] 2 2 2 2 4 4 3 2 4 4 4 3 2 2 3 4 3 2 4 2 4 4 3 4 4 4 3 4 4 4 3 3 4 3
## [1123] 3 3 4 3 3 4 2 3 4 4 2 4 2 2 2 4 3 4 4 3 3 2 2 4 2 4 3 3 2 4 3 2 4 3
## [1157] 3 4 4 4 4 4 4 2 1 4 2 2 4 4 2 4 4 1 2 4 4 4 3 3 3 1 4 2 3 4 1 4 4 2
## [1191] 3 2 4 4 1 4 4 4
## 
## [[5]][[2]]
## [1] 1198
## 
## 
## [[6]]
## NULL
# Row positions of the 15 largest fnlwgt values (column 3), in ascending order
# of fnlwgt. order() alone yields the same permutation as order(rank(...)):
# ties keep their original row order either way, so the rank() step is dropped.
fnlwgttrainout <- tail(order(newtrain[, 3]), 15)
# The corresponding fnlwgt values, looked up in one vectorised step instead of
# growing a vector element by element.
fnlout <- newtrain[fnlwgttrainout, 3]

#head(order(rank(newtrain[,5])))
# Frequency table of capital.gain (column 11).
table(newtrain[, 11])
## 
##     0   114   401   594   914   991  1055  1086  1111  1151  1173  1409 
## 29849     6     2    34     8     5    25     4     1     8     3     7 
##  1424  1455  1471  1506  1639  1797  1831  1848  2009  2036  2050  2062 
##     3     1     7    15     1     7     7     6     3     4     5     2 
##  2105  2174  2176  2202  2228  2290  2329  2346  2354  2387  2407  2414 
##     9    48    23    16     5     5     6     6    11     1    19     8 
##  2463  2538  2580  2597  2635  2653  2829  2885  2907  2936  2961  2964 
##    11     1    12    20    11     5    31    24    11     3     3     9 
##  2977  2993  3103  3137  3273  3325  3411  3418  3432  3456  3464  3471 
##     8     2    97    37     6    53    24     5     4     2    23     8 
##  3674  3781  3818  3887  3908  3942  4064  4101  4386  4416  4508  4650 
##    14    12     7     6    32    14    42    20    70    12    12    41 
##  4687  4787  4865  4931  4934  5013  5060  5178  5455  5556  5721  6097 
##     3    23    17     1     7    69     1    97    11     5     3     1 
##  6360  6418  6497  6514  6723  6767  6849  7298  7430  7443  7688  7896 
##     3     9    11     5     2     5    27   246     9     5   284     3 
##  7978  8614  9386  9562 10520 10566 10605 11678 13550 14084 14344 15020 
##     1    55    22     4    43     6    12     2    27    41    26     5 
## 15024 15831 18481 20051 22040 25124 25236 27828 34095 41310 99999 
##   347     6     2    37     1     4    11    34     5     2   159
# Rows whose capital.gain (column 11) equals 99999. That is the maximum in the
# frequency table above (159 rows) and looks like a top-coded sentinel value --
# TODO confirm. Selecting by value instead of by the hard-coded count 159
# returns the same row positions here and stays correct if the data change.
gainout <- which(newtrain[, 11] == 99999)



#Outliers removing for training sets.
dim(newtrain)
## [1] 32561    15
# Guard against an empty index: newtrain[-integer(0), ] would drop every row.
if (length(gainout) > 0) {
  newtrain <- newtrain[-gainout, ]
}
dim(newtrain)
## [1] 32402    15
#Deal with outliers for testing sets
# One boxplot per column listed in continuouscol, titled and labelled with
# the column name.
for (i in continuouscol) {
  boxplot(
    newtest[[i]],
    main = paste("boxplot for", names(newtest[i])),
    xlab = names(newtest[i])
  )
}

# One density curve per column, filled red with a blue outline.
for (i in continuouscol) {
  den_acc <- density(newtest[, i], adjust = 1)
  plot(den_acc, main = paste("density plot for", names(newtest[i])))
  polygon(den_acc, col = "red", border = "blue")
}

# For each column in continuouscol, store the values boxplot.stats() flags as
# outliers together with how many there are. Entries are placed at the column's
# own index, so positions not in continuouscol stay NULL.
outlierstest <- list()
for (i in continuouscol) {
  outliers <- boxplot.stats(newtest[[i]])[["out"]]
  numbers <- length(outliers)
  outlierstest[[i]] <- list(outliers, numbers)
}
# Show the first six entries.
head(outlierstest)
## [[1]]
## [[1]][[1]]
##  [1] 79 80 90 79 80 81 82 83 81 85 80 90 81 84 81 89 81 83 81 82 80 90 81
## [24] 83 80 90 90 84 80 80 80 81 90 85 90 81 81 80 80 79 81 80 88 87 90 79
## [47] 83 79 80 90 79 79 81 81 90 82 90 87 81 88 80 81 80 81 90 88 89 84 80
## [70] 80 83 79 81
## 
## [[1]][[2]]
## [1] 73
## 
## 
## [[2]]
## NULL
## 
## [[3]]
## [[3]][[1]]
##   [1]  444554  432824  465326  445382  479296  428420  456736  537222
##   [9]  513100  447488  512864  500068  446894  599057  479179  471990
##  [17]  457162  455379  542610  479600  448026  437200  652784  573446
##  [25]  453233  662460  426589  629900  499971  450770  481987  478373
##  [33]  486194  509364  632733  504725  560313  651702  644278  535852
##  [41]  445758  452353  475775  455469  522241  427744  473206  427541
##  [49]  581128  444725  608881  490871  430151  431245  451019  430336
##  [57]  433602  437994  436431  914061  624006  510072  484475  505365
##  [65]  593246  714597  816750  491214  446724  552529  454717  425622
##  [73]  575172  475322  622192  566066  493732  427437  427320  614113
##  [81]  445365  472517  459556  548568  565769  429832  424988  426350
##  [89]  789600  424340  447144  864960  497414  471876  723746  427422
##  [97]  421837  692831  535869  433624  638116  467936  698039  427812
## [105]  472861  449101  677398  464621  547931  497039  451742  460322
## [113]  666014  474568  452640  765214  445480  761800  460356 1047822
## [121]  436651  544319  617917  450695  429696  443377  522881  437161
## [129]  421010  479296  459189  469005  457070  750972  505365  458609
## [137]  520231  589155  538193  428251  454321  455399  477345  470486
## [145]  437318  588739  449578  486436  588484  449101  528618  806552
## [153]  478354  467936  505168  858091  451327  482082  663291  447554
## [161]  451603  455995  460408  581025  453983  656488  421633  478457
## [169]  422836  557349  421350  498267  442478  421228  655066  426431
## [177]  494371  737315  541755  436198  594521  442656  491000  455995
## [185]  430672  496856  589838  479296  605504  490332  423453  445382
## [193]  558752  448862  429281  772919  884434  495288  488720  444554
## [201]  604045  437940  697806  632271  497788  464484  587310  467759
## [209]  472344  438587  427055  538243  441227  459465  454950  439777
## [217] 1490400  768659  764638  437458  517995  718736  433682  477083
## [225]  442478  547108  474229  498833  882849  453663  443508  498411
## [233]  504423  746660  488459  423883  457357  501671  786418  565313
## [241]  483201  466458  424934  450200  465334  482096  451603  465725
## [249]  502633  473133  477867  435356  478457  653215  437825  576645
## [257]  510643  538099  425502  432480  482211  539019  496743  455379
## [265]  421132  452402  531055  454076  434081  452402  434710  446947
## [273]  472411  594187  685955  442116  435835  430278  548361  606111
## [281]  459192  592029  426263  513977  647591  566066  553588  433325
## [289]  491607  624572  488706  535740  607118  482677  420973  426431
## [297]  580591  449172  438427  557853  446390  487751  469263  478972
## [305]  441949  430930  635913  485944  557805  626493  444134  433580
## [313]  493034  914061  456736  557349  443336  953588  473547  457710
## [321]  471768  558344  421871  430710  481258  590204  679853  421474
## [329]  443809  516701  443546  535762  438321  814850  427812  874728
## [337]  497525  434102  450141  441949  438429  506830  478277  594194
## [345]  445480  452963  498267  538583  602513  589809  421474  507492
## [353]  546118  446647  530099  453686  443377 1117718  427248  461725
## [361]  460259  849067  590941  572285  608441  720428  423311  436361
## [369]  463601  557359  454024  431515  590522  443546  433592  479406
## [377]  430195  421633  428299  484911  478836  513440  744929  534775
## [385]  511231  598995  456592  525848  442359  458168  457453  913447
## [393]  584259  694105  441227  448841  606347  437566  495366 1024535
## [401]  427474  811615  431551  461929  533660  445382  427475 1210504
## [409]  426263  425830  421837  427770  447210  455995  435836  425816
## [417]  490645  513977  553405  497414  742903  431745  553405  504941
## [425]  450141  456665  449376  487770  448026  443858  473449  440934
## [433]  456430  421200  426589  484879  438696  435638  535027  464552
## [441]  443701  438427  513719  439263  425444  454585  428251  618130
## [449]  542762  771836  473133  464552  435266  437161  462964  423605
## [457]  618808  573446  432204  461484  455379  504871  532969  455665
## [465]  425127  449925  427515  607658  422933  430340  440129
## 
## [[3]][[2]]
## [1] 471
## 
## 
## [[4]]
## NULL
## 
## [[5]]
## [[5]][[1]]
##   [1] 4 4 3 4 4 4 4 4 4 3 2 3 4 4 2 4 4 3 3 2 4 3 3 4 3 3 4 4 4 1 1 4 3 2 4
##  [36] 4 2 3 4 4 1 4 1 4 4 4 3 4 4 3 4 3 4 2 4 2 4 4 4 3 4 2 4 4 3 3 1 1 4 3
##  [71] 4 2 3 4 3 3 3 4 4 4 4 4 3 3 3 2 2 4 4 4 4 3 3 4 3 3 3 3 1 2 3 3 3 1 4
## [106] 4 4 4 4 4 4 4 2 3 4 4 3 4 4 4 3 3 3 4 4 1 4 4 4 3 4 2 4 2 4 4 4 4 3 3
## [141] 4 4 1 4 3 4 4 4 3 4 4 4 3 3 3 4 2 2 4 2 4 4 4 4 4 4 4 4 4 2 4 4 3 4 1
## [176] 2 3 4 3 2 4 1 4 2 3 3 4 4 4 1 2 2 4 3 4 4 4 4 3 2 4 4 4 4 3 3 3 4 3 4
## [211] 2 4 4 4 3 4 3 2 4 4 3 4 2 2 4 1 2 3 4 2 4 4 4 4 4 2 4 4 4 3 4 3 4 3 4
## [246] 3 4 3 4 3 4 4 4 4 3 3 3 2 3 4 3 4 4 4 3 1 2 2 2 2 3 1 2 3 4 4 4 1 1 2
## [281] 4 4 4 4 2 4 3 4 3 1 3 3 1 3 4 4 4 4 4 4 3 3 3 3 3 3 4 4 4 4 3 4 4 3 2
## [316] 4 4 2 4 4 3 4 3 4 4 4 4 4 2 3 4 4 3 2 4 2 4 4 4 4 2 3 4 4 3 3 4 3 2 3
## [351] 4 2 3 4 4 3 4 4 2 4 4 3 2 4 4 4 2 4 4 4 3 4 3 3 4 2 4 2 3 3 3 4 3 4 3
## [386] 4 1 4 3 4 4 3 4 2 4 2 3 3 4 3 2 1 1 2 3 3 4 3 1 3 3 2 4 3 4 3 3 3 4 3
## [421] 4 4 2 3 3 3 3 1 3 3 2 4 3 4 1 2 3 4 4 4 4 4 4 3 3 2 3 4 4 3 4 2 4 4 4
## [456] 4 4 2 4 2 4 2 4 4 3 4 3 2 4 3 4 4 3 4 4 4 4 4 3 4 4 3 4 3 4 4 3 2 4 2
## [491] 2 4 2 4 3 4 4 3 4 3 4 3 4 1 1 4 3 2 4 4 4 4 3 3 4 4 2 4 4 4 3 4 3 1 4
## [526] 3 3 4 3 4 4 4 4 4 4 4 4 4 3 2 3 4 3 4 4 4 4 4 3 4 4 3 4 3 4 2 2 3 2 3
## [561] 3 3 4 4 4 1 3 3 3 4 4 1 3 4 2 3 3 3 2 3 3 4 4 4 3 4 4 1 4 4 4 4 4 4 4
## [596] 4
## 
## [[5]][[2]]
## [1] 596
## 
## 
## [[6]]
## NULL
# Count how often each distinct value occurs in column 11 (capital gain)
# of the testing set.
table(newtest[[11]])
## 
##     0   114   401   594   914   991  1055  1086  1151  1173  1264  1409 
## 14958     2     3    18     2     1    12     4     5     2     2     3 
##  1424  1455  1471  1506  1731  1797  1831  1848  2036  2062  2105  2174 
##     1     3     2     9     1     3     2     3     1     1     6    26 
##  2176  2202  2290  2329  2346  2354  2407  2414  2463  2538  2580  2597 
##     8    12     5     1     2    10     6     2     4     4     8    11 
##  2635  2653  2829  2885  2907  2936  2961  2964  2977  2993  3103  3137 
##     3     6    11     6     7     1     1     5     3     1    55    14 
##  3273  3325  3411  3418  3456  3464  3471  3674  3781  3818  3887  3908 
##     1    28    10     3     4    10     3     8     4     4     2    10 
##  3942  4064  4101  4386  4416  4508  4650  4687  4787  4865  4931  4934 
##     4    12     9    38    12    11    22     1    12     8     3     3 
##  5013  5060  5178  5455  5556  5721  6097  6418  6497  6514  6612  6723 
##    48     1    49     7     1     4     1     7     4     5     1     3 
##  6767  6849  7262  7298  7430  7443  7688  7896  7978  8614  9386  9562 
##     1    15     1   118     6     2   126     1     1    27     9     1 
## 10520 10566 10605 11678 13550 14084 14344 15020 15024 15831 20051 25124 
##    21     2     7     2    15     8     8     5   166     2    12     2 
## 25236 27828 34095 41310 99999 
##     3    24     1     1    85
# Rows whose capital gain (column 11) equals 99999, the maximum value that the
# frequency table above shows as an isolated spike. Selecting by value rather
# than by a row count copied from that table keeps the step correct if the
# data change upstream.
gainout <- which(newtest[, 11] == 99999)



#Outliers removing for testing sets.
dim(newtest)
## [1] 16281    15
# Negative indexing with an empty index vector would drop every row, so only
# subset when at least one row was flagged.
if (length(gainout) > 0) {
  newtest <- newtest[-gainout, ]
}
dim(newtest)
## [1] 16196    15
#Plots after removing outliers training
# Boxplot of each column in continuouscol, redrawn on the filtered training set.
for (i in continuouscol) {
  boxplot(
    newtrain[[i]],
    main = paste("boxplot for", names(newtrain[i]), "-outliers removed"),
    xlab = names(newtrain[i])
  )
}

# Density curve of each column, filled red with a blue outline.
for (i in continuouscol) {
  den_acc <- density(newtrain[, i], adjust = 1)
  plot(
    den_acc,
    main = paste("density plot for", names(newtrain[i]), "-outliers removed")
  )
  polygon(den_acc, col = "red", border = "blue")
}

#Plots after removing outliers testing
# Boxplot of each column in continuouscol, redrawn on the filtered testing set.
for (i in continuouscol) {
  boxplot(
    newtest[[i]],
    main = paste("boxplot for", names(newtest[i]), "-outliers removed"),
    xlab = names(newtest[i])
  )
}

# Density curve of each column, filled red with a blue outline.
for (i in continuouscol) {
  den_acc <- density(newtest[, i], adjust = 1)
  plot(
    den_acc,
    main = paste("density plot for", names(newtest[i]), "-outliers removed")
  )
  polygon(den_acc, col = "red", border = "blue")
}

\(\\\)

\(\\\)

Check-ups before discretizing and dummifying

#detach("package:plyr", unload=TRUE) #because plyr and dplyr existed together conflicting...

#Check whether categorical variables can be discretized....
# Plot and tabulate workclass in the training set.
plot(newtrain[["workclass"]])

table(newtrain[["workclass"]])
## 
##      Federal-gov        Local-gov     Never-worked          Private 
##              983             2187                9            23984 
##     Self-emp-inc Self-emp-not-inc        State-gov      Without-pay 
##             1127             2747             1351               14
# Row count and relative frequency of each workclass level (training set).
newtrain %>%
  group_by(workclass) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 8 x 3
##          workclass     n         freq
##             <fctr> <int>        <dbl>
## 1      Federal-gov   983 0.0303376335
## 2        Local-gov  2187 0.0674958336
## 3     Never-worked     9 0.0002777606
## 4          Private 23984 0.7402012221
## 5     Self-emp-inc  1127 0.0347818036
## 6 Self-emp-not-inc  2747 0.0847787174
## 7        State-gov  1351 0.0416949571
## 8      Without-pay    14 0.0004320721
# Plot and tabulate workclass in the testing set.
plot(newtest[["workclass"]])

table(newtest[["workclass"]])
## 
##      Federal-gov        Local-gov     Never-worked          Private 
##              480             1089                3            11919 
##     Self-emp-inc Self-emp-not-inc        State-gov      Without-pay 
##              570             1421              707                7
# Row count and relative frequency of each workclass level (testing set).
newtest %>%
  group_by(workclass) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 8 x 3
##          workclass     n         freq
##             <fctr> <int>        <dbl>
## 1      Federal-gov   480 0.0296369474
## 2        Local-gov  1089 0.0672388244
## 3     Never-worked     3 0.0001852309
## 4          Private 11919 0.7359224500
## 5     Self-emp-inc   570 0.0351938750
## 6 Self-emp-not-inc  1421 0.0877377130
## 7        State-gov   707 0.0436527538
## 8      Without-pay     7 0.0004322055
# Plot and tabulate education in the training set.
plot(newtrain[["education"]])

table(newtrain[["education"]])
## 
##         10th         11th         12th      1st-4th      5th-6th 
##          931         1175          433          168          333 
##      7th-8th          9th   Assoc-acdm    Assoc-voc    Bachelors 
##          646          513         1066         1381         5314 
##    Doctorate      HS-grad      Masters    Preschool  Prof-school 
##          401        10478         1705           51          530 
## Some-college 
##         7277
# Row count and relative frequency of each education level (training set).
newtrain %>%
  group_by(education) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 16 x 3
##       education     n        freq
##          <fctr> <int>       <dbl>
##  1         10th   931 0.028732794
##  2         11th  1175 0.036263194
##  3         12th   433 0.013363373
##  4      1st-4th   168 0.005184865
##  5      5th-6th   333 0.010277143
##  6      7th-8th   646 0.019937041
##  7          9th   513 0.015832356
##  8   Assoc-acdm  1066 0.032899204
##  9    Assoc-voc  1381 0.042620826
## 10    Bachelors  5314 0.164002222
## 11    Doctorate   401 0.012375779
## 12      HS-grad 10478 0.323375100
## 13      Masters  1705 0.052620209
## 14    Preschool    51 0.001573977
## 15  Prof-school   530 0.016357015
## 16 Some-college  7277 0.224584902
# Plot and tabulate education in the testing set.
plot(newtest[["education"]])

table(newtest[["education"]])
## 
##         10th         11th         12th      1st-4th      5th-6th 
##          456          637          224           79          175 
##      7th-8th          9th   Assoc-acdm    Assoc-voc    Bachelors 
##          309          242          534          677         2648 
##    Doctorate      HS-grad      Masters    Preschool  Prof-school 
##          170         5272          922           32          236 
## Some-college 
##         3583
# Row count and relative frequency of each education level (testing set).
newtest %>%
  group_by(education) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 16 x 3
##       education     n        freq
##          <fctr> <int>       <dbl>
##  1         10th   456 0.028155100
##  2         11th   637 0.039330699
##  3         12th   224 0.013830575
##  4      1st-4th    79 0.004877748
##  5      5th-6th   175 0.010805137
##  6      7th-8th   309 0.019078785
##  7          9th   242 0.014941961
##  8   Assoc-acdm   534 0.032971104
##  9    Assoc-voc   677 0.041800445
## 10    Bachelors  2648 0.163497160
## 11    Doctorate   170 0.010496419
## 12      HS-grad  5272 0.325512472
## 13      Masters   922 0.056927636
## 14    Preschool    32 0.001975796
## 15  Prof-school   236 0.014571499
## 16 Some-college  3583 0.221227464
# Plot and tabulate marital.status in the training set.
plot(newtrain[["marital.status"]])

table(newtrain[["marital.status"]])
## 
##              Divorced     Married-AF-spouse    Married-civ-spouse 
##                  4432                    23                 14844 
## Married-spouse-absent         Never-married             Separated 
##                   417                 10671                  1023 
##               Widowed 
##                   992
# Row count and relative frequency of each marital.status level (training set).
newtrain %>%
  group_by(marital.status) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 7 x 3
##          marital.status     n         freq
##                  <fctr> <int>        <dbl>
## 1              Divorced  4432 0.1367816801
## 2     Married-AF-spouse    23 0.0007098327
## 3    Married-civ-spouse 14844 0.4581198691
## 4 Married-spouse-absent   417 0.0128695760
## 5         Never-married 10671 0.3293315227
## 6             Separated  1023 0.0315721252
## 7               Widowed   992 0.0306153941
# Plot and tabulate marital.status in the testing set.
plot(newtest[["marital.status"]])

table(newtest[["marital.status"]])
## 
##              Divorced     Married-AF-spouse    Married-civ-spouse 
##                  2181                    13                  7340 
## Married-spouse-absent         Never-married             Separated 
##                   210                  5425                   503 
##               Widowed 
##                   524
# Row count and relative frequency of each marital.status level (testing set).
newtest %>%
  group_by(marital.status) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 7 x 3
##          marital.status     n         freq
##                  <fctr> <int>        <dbl>
## 1              Divorced  2181 0.1346628797
## 2     Married-AF-spouse    13 0.0008026673
## 3    Married-civ-spouse  7340 0.4531983206
## 4 Married-spouse-absent   210 0.0129661645
## 5         Never-married  5425 0.3349592492
## 6             Separated   503 0.0310570511
## 7               Widowed   524 0.0323536676
# Plot and tabulate occupation in the training set.
plot(newtrain[["occupation"]])

table(newtrain[["occupation"]])
## 
##      Adm-clerical      Armed-Forces      Craft-repair   Exec-managerial 
##              3986                 9              4154              4085 
##   Farming-fishing Handlers-cleaners Machine-op-inspct     Other-service 
##              1185              1617              2184              3694 
##   Priv-house-serv    Prof-specialty   Protective-serv             Sales 
##               206              4228               734              3690 
##      Tech-support  Transport-moving 
##               992              1638
# Row count and relative frequency of each occupation level (training set).
newtrain %>%
  group_by(occupation) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 14 x 3
##           occupation     n         freq
##               <fctr> <int>        <dbl>
##  1      Adm-clerical  3986 0.1230170977
##  2      Armed-Forces     9 0.0002777606
##  3      Craft-repair  4154 0.1282019628
##  4   Exec-managerial  4085 0.1260724647
##  5   Farming-fishing  1185 0.0365718166
##  6 Handlers-cleaners  1617 0.0499043269
##  7 Machine-op-inspct  2184 0.0674032467
##  8     Other-service  3694 0.1140053083
##  9   Priv-house-serv   206 0.0063576322
## 10    Prof-specialty  4228 0.1304857725
## 11   Protective-serv   734 0.0226529227
## 12             Sales  3690 0.1138818591
## 13      Tech-support   992 0.0306153941
## 14  Transport-moving  1638 0.0505524350
# Plot and tabulate occupation in the testing set.
plot(newtest[["occupation"]])

table(newtest[["occupation"]])
## 
##      Adm-clerical      Armed-Forces      Craft-repair   Exec-managerial 
##              1965                 6              2032              2009 
##   Farming-fishing Handlers-cleaners Machine-op-inspct     Other-service 
##               576               864              1085              1824 
##   Priv-house-serv    Prof-specialty   Protective-serv             Sales 
##               133              2077               367              1912 
##      Tech-support  Transport-moving 
##               548               798
# Row count and relative frequency of each occupation level (testing set).
newtest %>%
  group_by(occupation) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 14 x 3
##           occupation     n         freq
##               <fctr> <int>        <dbl>
##  1      Adm-clerical  1965 0.1213262534
##  2      Armed-Forces     6 0.0003704618
##  3      Craft-repair  2032 0.1254630773
##  4   Exec-managerial  2009 0.1240429736
##  5   Farming-fishing   576 0.0355643369
##  6 Handlers-cleaners   864 0.0533465053
##  7 Machine-op-inspct  1085 0.0669918498
##  8     Other-service  1824 0.1126204001
##  9   Priv-house-serv   133 0.0082119042
## 10    Prof-specialty  2077 0.1282415411
## 11   Protective-serv   367 0.0226599160
## 12             Sales  1912 0.1180538405
## 13      Tech-support   548 0.0338355149
## 14  Transport-moving   798 0.0492714250
# Plot and tabulate relationship in the training set.
plot(newtrain[["relationship"]])

table(newtrain[["relationship"]])
## 
##        Husband  Not-in-family Other-relative      Own-child      Unmarried 
##          13072           8284            981           5066           3442 
##           Wife 
##           1557
# Row count and relative frequency of each relationship level (training set).
newtrain %>%
  group_by(relationship) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 6 x 3
##     relationship     n       freq
##           <fctr> <int>      <dbl>
## 1        Husband 13072 0.40343189
## 2  Not-in-family  8284 0.25566323
## 3 Other-relative   981 0.03027591
## 4      Own-child  5066 0.15634837
## 5      Unmarried  3442 0.10622801
## 6           Wife  1557 0.04805259
# Plot and tabulate relationship in the testing set.
plot(newtest[["relationship"]])

table(newtest[["relationship"]])
## 
##        Husband  Not-in-family Other-relative      Own-child      Unmarried 
##           6465           4262            525           2511           1676 
##           Wife 
##            757
# Row count and relative frequency of each relationship level (testing set).
newtest %>%
  group_by(relationship) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 6 x 3
##     relationship     n       freq
##           <fctr> <int>      <dbl>
## 1        Husband  6465 0.39917264
## 2  Not-in-family  4262 0.26315140
## 3 Other-relative   525 0.03241541
## 4      Own-child  2511 0.15503828
## 5      Unmarried  1676 0.10348234
## 6           Wife   757 0.04673994
# Plot and tabulate race in the training set.
plot(newtrain[["race"]])

table(newtrain[["race"]])
## 
## Amer-Indian-Eskimo Asian-Pac-Islander              Black 
##                311               1029               3117 
##              Other              White 
##                269              27676
# Row count and relative frequency of each race level (training set).
newtrain %>%
  group_by(race) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 5 x 3
##                 race     n        freq
##               <fctr> <int>       <dbl>
## 1 Amer-Indian-Eskimo   311 0.009598173
## 2 Asian-Pac-Islander  1029 0.031757299
## 3              Black  3117 0.096197766
## 4              Other   269 0.008301957
## 5              White 27676 0.854144806
# Plot and tabulate race in the testing set.
plot(newtest[["race"]])

table(newtest[["race"]])
## 
## Amer-Indian-Eskimo Asian-Pac-Islander              Black 
##                159                475               1558 
##              Other              White 
##                134              13870
# Row count and relative frequency of each race level (testing set).
newtest %>%
  group_by(race) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 5 x 3
##                 race     n        freq
##               <fctr> <int>       <dbl>
## 1 Amer-Indian-Eskimo   159 0.009817239
## 2 Asian-Pac-Islander   475 0.029328229
## 3              Black  1558 0.096196592
## 4              Other   134 0.008273648
## 5              White 13870 0.856384292
# Plot and tabulate sex in the training set.
plot(newtrain[["sex"]])

table(newtrain[["sex"]])
## 
## Female   Male 
##  10749  21653
# Row count and relative frequency of each sex level (training set).
newtrain %>%
  group_by(sex) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 2 x 3
##      sex     n      freq
##   <fctr> <int>     <dbl>
## 1 Female 10749 0.3317388
## 2   Male 21653 0.6682612
# Plot and tabulate sex in the testing set.
plot(newtest[["sex"]])

table(newtest[["sex"]])
## 
## Female   Male 
##   5407  10789
# Row count and relative frequency of each sex level (testing set).
newtest %>%
  group_by(sex) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 2 x 3
##      sex     n      freq
##   <fctr> <int>     <dbl>
## 1 Female  5407 0.3338479
## 2   Male 10789 0.6661521
# Plot and tabulate native.country in the training set.
plot(newtrain[["native.country"]])

table(newtrain[["native.country"]])
## 
##                   Cambodia                     Canada 
##                         20                        120 
##                      China                   Columbia 
##                         79                         59 
##                       Cuba         Dominican-Republic 
##                         95                         70 
##                    Ecuador                El-Salvador 
##                         28                        106 
##                    England                     France 
##                         90                         29 
##                    Germany                     Greece 
##                        137                         29 
##                  Guatemala                      Haiti 
##                         64                         44 
##         Holand-Netherlands                   Honduras 
##                          1                         13 
##                       Hong                    Hungary 
##                         23                         13 
##                      India                       Iran 
##                        104                         43 
##                    Ireland                      Italy 
##                         24                         74 
##                    Jamaica                      Japan 
##                         81                         66 
##                       Laos                     Mexico 
##                         22                        656 
##                  Nicaragua Outlying-US(Guam-USVI-etc) 
##                         34                         14 
##                       Peru                Philippines 
##                         31                        210 
##                     Poland                   Portugal 
##                         60                         37 
##                Puerto-Rico                   Scotland 
##                        114                         12 
##                      South                     Taiwan 
##                         89                         56 
##                   Thailand            Trinadad&Tobago 
##                         19                         19 
##              United-States                    Vietnam 
##                      29528                         73 
##                 Yugoslavia 
##                         16
# Row count and relative frequency of each native.country level (training set).
newtrain %>%
  group_by(native.country) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 41 x 3
##        native.country     n         freq
##                <fctr> <int>        <dbl>
##  1           Cambodia    20 0.0006172458
##  2             Canada   120 0.0037034751
##  3              China    79 0.0024381211
##  4           Columbia    59 0.0018208753
##  5               Cuba    95 0.0029319178
##  6 Dominican-Republic    70 0.0021603605
##  7            Ecuador    28 0.0008641442
##  8        El-Salvador   106 0.0032714030
##  9            England    90 0.0027776063
## 10             France    29 0.0008950065
## # ... with 31 more rows
# Plot and tabulate native.country in the testing set.
plot(newtest[["native.country"]])

table(newtest[["native.country"]])
## 
##                   Cambodia                     Canada 
##                         12                         61 
##                      China                   Columbia 
##                         50                         26 
##                       Cuba         Dominican-Republic 
##                         43                         34 
##                    Ecuador                El-Salvador 
##                         17                         49 
##                    England                     France 
##                         38                          9 
##                    Germany                     Greece 
##                         69                         20 
##                  Guatemala                      Haiti 
##                         24                         31 
##                   Honduras                       Hong 
##                          7                         10 
##                    Hungary                      India 
##                          6                         56 
##                       Iran                    Ireland 
##                         16                         13 
##                      Italy                    Jamaica 
##                         32                         25 
##                      Japan                       Laos 
##                         32                          5 
##                     Mexico                  Nicaragua 
##                        310                         15 
## Outlying-US(Guam-USVI-etc)                       Peru 
##                          9                         15 
##                Philippines                     Poland 
##                        109                         27 
##                   Portugal                Puerto-Rico 
##                         30                         70 
##                   Scotland                      South 
##                          9                         37 
##                     Taiwan                   Thailand 
##                         17                         13 
##            Trinadad&Tobago              United-States 
##                          8                      14813 
##                    Vietnam                 Yugoslavia 
##                         22                          7
# Row count and relative frequency of each native.country level (testing set).
# NOTE(review): the rendered output lists 40 countries here versus 41 for the
# training set (Holand-Netherlands has one training row and does not appear in
# the testing table) -- confirm the factor levels are aligned before any
# dummy coding.
newtest %>%
  group_by(native.country) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 40 x 3
##        native.country     n         freq
##                <fctr> <int>        <dbl>
##  1           Cambodia    12 0.0007409237
##  2             Canada    61 0.0037663621
##  3              China    50 0.0030871820
##  4           Columbia    26 0.0016053347
##  5               Cuba    43 0.0026549765
##  6 Dominican-Republic    34 0.0020992838
##  7            Ecuador    17 0.0010496419
##  8        El-Salvador    49 0.0030254384
##  9            England    38 0.0023462583
## 10             France     9 0.0005556928
## # ... with 30 more rows
#Check collinearity issues
# Frequency of each education level, for comparison with the education.num
# frequencies computed next.
newtrain %>%
  group_by(education) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 16 x 3
##       education     n        freq
##          <fctr> <int>       <dbl>
##  1         10th   931 0.028732794
##  2         11th  1175 0.036263194
##  3         12th   433 0.013363373
##  4      1st-4th   168 0.005184865
##  5      5th-6th   333 0.010277143
##  6      7th-8th   646 0.019937041
##  7          9th   513 0.015832356
##  8   Assoc-acdm  1066 0.032899204
##  9    Assoc-voc  1381 0.042620826
## 10    Bachelors  5314 0.164002222
## 11    Doctorate   401 0.012375779
## 12      HS-grad 10478 0.323375100
## 13      Masters  1705 0.052620209
## 14    Preschool    51 0.001573977
## 15  Prof-school   530 0.016357015
## 16 Some-college  7277 0.224584902
# Frequency of each education.num value in the training set.
newtrain %>%
  group_by(education.num) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
## # A tibble: 16 x 3
##    education.num     n        freq
##            <int> <int>       <dbl>
##  1             1    51 0.001573977
##  2             2   168 0.005184865
##  3             3   333 0.010277143
##  4             4   646 0.019937041
##  5             5   513 0.015832356
##  6             6   931 0.028732794
##  7             7  1175 0.036263194
##  8             8   433 0.013363373
##  9             9 10478 0.323375100
## 10            10  7277 0.224584902
## 11            11  1381 0.042620826
## 12            12  1066 0.032899204
## 13            13  5314 0.164002222
## 14            14  1705 0.052620209
## 15            15   530 0.016357015
## 16            16   401 0.012375779
# Drop the education factor from both sets: the two frequency tables above
# show the same counts for education and education.num. Removing the column by
# name (not by position 4) makes the step safe to re-run and independent of
# column order.
newtrain <- newtrain[, setdiff(names(newtrain), "education"), drop = FALSE]
newtest <- newtest[, setdiff(names(newtest), "education"), drop = FALSE]

\(\\\)

\(\\\)

c) 6 - 8 EDAs

#Find correlations of the data - for collinearity issue checks
# Pairwise correlations of the numeric columns of the testing set, selected
# by name.
cor(newtest[, c("age", "fnlwgt", "education.num", "capital.gain",
                "hours.per.week")])
##                        age        fnlwgt education.num  capital.gain
## age             1.00000000 -0.0759176992    0.01555523  0.1080390077
## fnlwgt         -0.07591770  1.0000000000   -0.02926279 -0.0007549241
## education.num   0.01555523 -0.0292627902    1.00000000  0.1417220957
## capital.gain    0.10803901 -0.0007549241    0.14172210  1.0000000000
## hours.per.week  0.07425722 -0.0026773627    0.12954445  0.0833160656
##                hours.per.week
## age               0.074257217
## fnlwgt           -0.002677363
## education.num     0.129544454
## capital.gain      0.083316066
## hours.per.week    1.000000000
# Pairwise correlations of the same numeric columns in the training set,
# selected by name.
cor(newtrain[, c("age", "fnlwgt", "education.num", "capital.gain",
                 "hours.per.week")])
##                        age       fnlwgt education.num capital.gain
## age             1.00000000 -0.076917052    0.03330048  0.116518227
## fnlwgt         -0.07691705  1.000000000   -0.04362125 -0.004506565
## education.num   0.03330048 -0.043621248    1.00000000  0.145735884
## capital.gain    0.11651823 -0.004506565    0.14573588  1.000000000
## hours.per.week  0.06774934 -0.019547738    0.14384089  0.082952143
##                hours.per.week
## age                0.06774934
## fnlwgt            -0.01954774
## education.num      0.14384089
## capital.gain       0.08295214
## hours.per.week     1.00000000
# Remove the fnlwgt variable from both sets. Dropping by name rather than by
# position 3 makes the step safe to re-run and independent of any earlier
# column removal.
newtrain <- newtrain[, setdiff(names(newtrain), "fnlwgt"), drop = FALSE]
newtest <- newtest[, setdiff(names(newtest), "fnlwgt"), drop = FALSE]



#See structure and summaries after removing outliers
str(newtest)
## 'data.frame':    16196 obs. of  13 variables:
##  $ age           : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 4 4 2 4 4 4 4 6 4 4 ...
##  $ education.num : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 3 3 5 5 5 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 7 5 11 7 12 8 6 10 8 3 ...
##  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 1 1 4 2 5 1 5 1 ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 5 5 3 5 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 2 2 2 1 2 ...
##  $ capital.gain  : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 50 40 40 30 30 40 32 40 10 ...
##  $ native.country: Factor w/ 40 levels "Cambodia","Canada",..: 38 38 38 38 38 38 38 38 38 38 ...
##  $ income        : Factor w/ 2 levels "<=50K.",">50K.": 1 1 2 2 1 1 1 2 1 1 ...
# Per-column summary of the test set (quartiles for numeric columns, level
# counts for factors).
summary(newtest)
##       age                   workclass     education.num  
##  Min.   :17.00   Private         :11919   Min.   : 1.00  
##  1st Qu.:28.00   Self-emp-not-inc: 1421   1st Qu.: 9.00  
##  Median :37.00   Local-gov       : 1089   Median :10.00  
##  Mean   :38.72   State-gov       :  707   Mean   :10.06  
##  3rd Qu.:48.00   Self-emp-inc    :  570   3rd Qu.:12.00  
##  Max.   :90.00   Federal-gov     :  480   Max.   :16.00  
##                  (Other)         :   10                  
##                marital.status           occupation           relationship 
##  Divorced             :2181   Prof-specialty :2077   Husband       :6465  
##  Married-AF-spouse    :  13   Craft-repair   :2032   Not-in-family :4262  
##  Married-civ-spouse   :7340   Exec-managerial:2009   Other-relative: 525  
##  Married-spouse-absent: 210   Adm-clerical   :1965   Own-child     :2511  
##  Never-married        :5425   Sales          :1912   Unmarried     :1676  
##  Separated            : 503   Other-service  :1824   Wife          : 757  
##  Widowed              : 524   (Other)        :4377                        
##                  race           sex         capital.gain    
##  Amer-Indian-Eskimo:  159   Female: 5407   Min.   :    0.0  
##  Asian-Pac-Islander:  475   Male  :10789   1st Qu.:    0.0  
##  Black             : 1558                  Median :    0.0  
##  Other             :  134                  Mean   :  562.8  
##  White             :13870                  3rd Qu.:    0.0  
##                                            Max.   :41310.0  
##                                                             
##   capital.loss     hours.per.week        native.country     income     
##  Min.   :   0.00   Min.   : 1.00   United-States:14813   <=50K.:12435  
##  1st Qu.:   0.00   1st Qu.:40.00   Mexico       :  310   >50K. : 3761  
##  Median :   0.00   Median :40.00   Philippines  :  109                 
##  Mean   :  88.36   Mean   :40.33   Puerto-Rico  :   70                 
##  3rd Qu.:   0.00   3rd Qu.:45.00   Germany      :   69                 
##  Max.   :3770.00   Max.   :99.00   Canada       :   61                 
##                                    (Other)      :  764
# Structure of the training set. Compared with the test set printed above,
# native.country has 41 levels instead of 40 and the income labels carry no
# trailing "." -- both differences matter once the sets are stacked.
str(newtrain)
## 'data.frame':    32402 obs. of  13 variables:
##  $ age           : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
##  $ education.num : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ marital.status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
##  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
##  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
##  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
##  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
##  $ capital.gain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week: int  40 13 40 40 40 40 16 45 50 40 ...
##  $ native.country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
##  $ income        : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
# Per-column summary of the training set.
summary(newtrain)
##       age                   workclass     education.num  
##  Min.   :17.00   Private         :23984   Min.   : 1.00  
##  1st Qu.:28.00   Self-emp-not-inc: 2747   1st Qu.: 9.00  
##  Median :37.00   Local-gov       : 2187   Median :10.00  
##  Mean   :38.54   State-gov       : 1351   Mean   :10.07  
##  3rd Qu.:48.00   Self-emp-inc    : 1127   3rd Qu.:12.00  
##  Max.   :90.00   Federal-gov     :  983   Max.   :16.00  
##                  (Other)         :   23                  
##                marital.status            occupation  
##  Divorced             : 4432   Prof-specialty :4228  
##  Married-AF-spouse    :   23   Craft-repair   :4154  
##  Married-civ-spouse   :14844   Exec-managerial:4085  
##  Married-spouse-absent:  417   Adm-clerical   :3986  
##  Never-married        :10671   Other-service  :3694  
##  Separated            : 1023   Sales          :3690  
##  Widowed              :  992   (Other)        :8565  
##          relationship                   race           sex       
##  Husband       :13072   Amer-Indian-Eskimo:  311   Female:10749  
##  Not-in-family : 8284   Asian-Pac-Islander: 1029   Male  :21653  
##  Other-relative:  981   Black             : 3117                 
##  Own-child     : 5066   Other             :  269                 
##  Unmarried     : 3442   White             :27676                 
##  Wife          : 1557                                            
##                                                                  
##   capital.gain      capital.loss     hours.per.week        native.country 
##  Min.   :    0.0   Min.   :   0.00   Min.   : 1.00   United-States:29528  
##  1st Qu.:    0.0   1st Qu.:   0.00   1st Qu.:40.00   Mexico       :  656  
##  Median :    0.0   Median :   0.00   Median :40.00   Philippines  :  210  
##  Mean   :  592.2   Mean   :  87.73   Mean   :40.39   Germany      :  137  
##  3rd Qu.:    0.0   3rd Qu.:   0.00   3rd Qu.:45.00   Canada       :  120  
##  Max.   :41310.0   Max.   :4356.00   Max.   :99.00   Puerto-Rico  :  114  
##                                                      (Other)      : 1637  
##    income     
##  <=50K:24720  
##  >50K : 7682  
##               
##               
##               
##               
## 
#Analyzing/checking before discretizing
# table(newtrain[,14])
# table(newtest[,14])
# 
# plot(newtrain$education)
# plot(newtrain$occupation)
# plot(newtrain$native.country)
# 
# plot(newtest$education)
# plot(newtest$occupation)
# plot(newtest$native.country)



#Discretize training set
# discretetrainage <- discretize(newtrain$age, method = "interval", categories = 10)
# discretetrainfnlwgt <- discretize(newtrain$fnlwgt, method = "interval", categories = 10)
# discretetrainedunum <- discretize(newtrain$education.num, method = "interval", categories = 10)
# discretetraingain <- discretize(newtrain$capital.gain, method = "interval", categories = 10)
# discretetrainloss <- discretize(newtrain$capital.loss, method = "interval", categories = 10)
# discretetrainhours <- discretize(newtrain$hours.per.week, method = "interval", categories = 10)



#Binning
# Bin country names: keep the three most frequent countries and collapse
# every other value into "other_countries".
#
# vector: character vector of country names (callers pass
#         as.character(<factor>)).
# Returns a vector of the same length and attributes in which every
# non-missing value outside the kept set is replaced by "other_countries".
#
# The replacement is vectorised. Unlike an element-wise `if` loop over
# 1:length(vector), it returns empty input unchanged and leaves NA entries
# as NA instead of failing with "missing value where TRUE/FALSE needed".
countrydis <- function(vector){
  keep <- c("United-States", "Mexico", "Philippines")
  # %in% never yields NA, so the mask is safe to index with; NAs are
  # excluded explicitly so they are not relabelled as a real category.
  vector[!is.na(vector) & !(vector %in% keep)] <- "other_countries"
  vector
}

# Bin work classes: keep the six listed classes and collapse every other
# value into "No-gain".
#
# vector: character vector of work-class labels (callers pass
#         as.character(<factor>)).
# Returns a vector of the same length and attributes in which every
# non-missing value outside the kept set is replaced by "No-gain".
#
# The replacement is vectorised. Unlike an element-wise `if` loop over
# 1:length(vector), it returns empty input unchanged and leaves NA entries
# as NA instead of failing with "missing value where TRUE/FALSE needed".
workdis <- function(vector){
  keep <- c("Federal-gov", "Local-gov", "Private",
            "Self-emp-inc", "Self-emp-not-inc", "State-gov")
  # %in% never yields NA, so the mask is safe to index with; NAs are
  # excluded explicitly so they are not relabelled as a real category.
  vector[!is.na(vector) & !(vector %in% keep)] <- "No-gain"
  vector
}

#discretetraincountry <- as.factor(countrydis(as.character(newtrain$native.country)))



#Discretize testing set
# discretetestage <- discretize(newtest$age, method = "interval", categories = 10)
# discretetestfnlwgt <- discretize(newtest$fnlwgt, method = "interval", categories = 10)
# discretetestedunum <- discretize(newtest$education.num, method = "interval", categories = 10)
# discretetestgain <- discretize(newtest$capital.gain, method = "interval", categories = 10)
# discretetestloss <- discretize(newtest$capital.loss, method = "interval", categories = 10)
# discretetesthours <- discretize(newtest$hours.per.week, method = "interval", categories = 10)
# discretetestcountry <- as.factor(countrydis(as.character(newtest$native.country)))
#Combine training and testing to make the same intervals for discretizing



# Tag every row with the set it came from, then stack the two sets so that
# both receive exactly the same binning.
newtrain[["type"]] <- "train"
newtest[["type"]] <- "test"
combined <- rbind(newtrain, newtest)



# discreteage <- discretize(combined$age, method = "interval", categories = 10)
# discretefnlwgt <- discretize(combined$fnlwgt, method = "interval", categories = 10)
# discreteedunum <- discretize(combined$education.num, method = "interval", categories = 10)
# discretegain <- discretize(combined$capital.gain, method = "interval", categories = 7) #not enough data
# discreteloss <- discretize(combined$capital.loss, method = "interval", categories = 7) #not enough data
# discretehours <- discretize(combined$hours.per.week, method = "interval", categories = 10)
# Collapse rare countries / work classes on the stacked data and rebuild the
# factors from the binned labels.
discretecountry <- factor(countrydis(as.character(combined[["native.country"]])))
discreteworkclass <- factor(workdis(as.character(combined[["workclass"]])))



# combined$age <- discreteage
# combined$fnlwgt <- discretefnlwgt
# combined$education.num <- discreteedunum
# combined$capital.gain <- discretegain
# combined$capital.loss <- discreteloss
# combined$hours.per.week <- discretehours
# Write the binned factors back over the original columns.
combined[["native.country"]] <- discretecountry
combined[["workclass"]] <- discreteworkclass



dim(combined)
## [1] 48598    14
# Split the stacked frame back into its two sets. Rows are selected by the
# `type` tag itself and the tag column is dropped by name, so the split does
# not depend on the train rows coming first, on `type` being column 14, or
# on `1:sum(...)` (which counts down to c(1, 0) when no row matches).
newtrain2 <- combined[combined$type == "train", names(combined) != "type"]
newtest2 <- combined[combined$type == "test", names(combined) != "type"]
dim(newtrain2)
## [1] 32402    13
dim(newtest2)
## [1] 16196    13
# Plot each of the 12 predictors against income (column 13), first for the
# training set and then for the test set.
par(mfrow = c(2, 2)) # 2 x 2 grid: four plots per page

# Every panel is labelled explicitly with the predictor name and the data
# set; with the default labels all 24 plots show the literal expressions
# "newtrain2[, i]" / "newtest2[, i]" and cannot be told apart.
for(i in 1:12){
  plot(newtrain2[, i], newtrain2[, 13],
       xlab = names(newtrain2)[i], ylab = names(newtrain2)[13],
       main = "train")
}

for(i in 1:12){
  plot(newtest2[, i], newtest2[, 13],
       xlab = names(newtest2)[i], ylab = names(newtest2)[13],
       main = "test")
}

#Assigning discretized variables
# newtrain2 <- newtrain
# newtest2 <- newtest
# dim(newtrain2)
# dim(newtest2)
# 
# newtrain2$age <- discretetrainage
# newtrain2$fnlwgt <- discretetrainfnlwgt
# newtrain2$education.num <- discretetrainedunum
# newtrain2$capital.gain <- discretetraingain
# newtrain2$capital.loss <- discretetrainloss
# newtrain2$hours.per.week <- discretetrainhours
# newtrain2$native.country <- discretetraincountry
# 
# newtest2$age <- discretetestage
# newtest2$fnlwgt <- discretetestfnlwgt
# newtest2$education.num <- discretetestedunum
# newtest2$capital.gain <- discretetestgain
# newtest2$capital.loss <- discretetestloss
# newtest2$hours.per.week <- discretetesthours
# newtest2$native.country <- discretetestcountry



#Dummify training set
# One call per categorical predictor; each result is kept in its own object
# and column-bound onto the numeric columns further down.
# NOTE(review): `dummy` is not defined in this section. Judging by the str()
# output of the final frames it yields one 0/1 column per factor level,
# named after the level, with the first level omitted -- confirm against
# its definition.
dumtrainwork <- dummy(newtrain2$workclass)
dumtrainmarry <- dummy(newtrain2$marital.status)
dumtrainoccu <- dummy(newtrain2$occupation)
dumtrainrelation <- dummy(newtrain2$relationship)
dumtrainrace <- dummy(newtrain2$race)
dumtrainsex <- dummy(newtrain2$sex)
dumtraincountry <- dummy(newtrain2$native.country)



#Dummify testing set
# Same seven predictors, encoded in the same order as for the training set.
dumtestwork <- dummy(newtest2$workclass)
dumtestmarry <- dummy(newtest2$marital.status)
dumtestoccu <- dummy(newtest2$occupation)
dumtestrelation <- dummy(newtest2$relationship)
dumtestrace <- dummy(newtest2$race)
dumtestsex <- dummy(newtest2$sex)
dumtestcountry <- dummy(newtest2$native.country)



# Keep only the columns that were not dummy-encoded: age (1),
# education.num (3), capital.gain (9), capital.loss (10),
# hours.per.week (11) and income (13).
newtrain2 <- newtrain2[c(1, 3, 9, 10, 11, 13)]
newtest2 <- newtest2[c(1, 3, 9, 10, 11, 13)]



# Append the dummy-encoded columns to the training set, then move `income`
# to the last column.
newtrain2 <- cbind(newtrain2, dumtrainwork, dumtrainmarry, dumtrainoccu,
                   dumtrainrelation, dumtrainrace, dumtrainsex, dumtraincountry)
# `income` is located by name instead of through the fixed positions
# 45 / 6 / 44: those only hold while cbind() yields exactly 44 columns, and
# would otherwise overwrite or rename an unrelated column.
stopifnot(sum(names(newtrain2) == "income") == 1)
newtrain2 <- newtrain2[, c(which(names(newtrain2) != "income"),
                           which(names(newtrain2) == "income"))]
dim(newtrain2)
## [1] 32402    44
# Append the dummy-encoded columns to the test set, then move `income` to
# the last column.
newtest2 <- cbind(newtest2, dumtestwork, dumtestmarry, dumtestoccu,
                   dumtestrelation, dumtestrace, dumtestsex, dumtestcountry)
# `income` is located by name instead of through the fixed positions
# 45 / 6 / 44: those only hold while cbind() yields exactly 44 columns, and
# would otherwise overwrite or rename an unrelated column.
stopifnot(sum(names(newtest2) == "income") == 1)
newtest2 <- newtest2[, c(which(names(newtest2) != "income"),
                         which(names(newtest2) == "income"))]
dim(newtest2)
## [1] 16196    44
# Harmonise the income labels of the two sets.
# Stacking train and test left each income factor with four levels
# ("<=50K", ">50K" and the test file's dotted "<=50K.", ">50K."). Both
# factors are rebuilt from their labels with a trailing "." removed, so each
# ends up with the same two levels "<=50K" and ">50K".
# The anchored pattern only removes a dot that is really there, which keeps
# this step safe to re-run; chopping the last character unconditionally
# would turn an already clean "<=50K" into "<=50".
newtrain2$income <- factor(sub("\\.$", "", as.character(newtrain2$income)))
newtest2$income <- factor(sub("\\.$", "", as.character(newtest2$income)))



dim(newtrain2)
## [1] 32402    44
dim(newtest2)
## [1] 16196    44
str(newtrain2)
## 'data.frame':    32402 obs. of  44 variables:
##  $ age                  : int  39 50 38 53 28 37 49 52 31 42 ...
##  $ education.num        : int  13 13 9 7 13 14 5 9 14 13 ...
##  $ capital.gain         : int  2174 0 0 0 0 0 0 0 14084 5178 ...
##  $ capital.loss         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week       : int  40 13 40 40 40 40 16 45 50 40 ...
##  $ Local-gov            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ No-gain              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Private              : num  0 0 1 1 1 1 1 0 1 1 ...
##  $ Self-emp-inc         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Self-emp-not-inc     : num  0 1 0 0 0 0 0 1 0 0 ...
##  $ State-gov            : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ Married-AF-spouse    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married-civ-spouse   : num  0 1 0 1 1 1 0 1 0 1 ...
##  $ Married-spouse-absent: num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Never-married        : num  1 0 0 0 0 0 0 0 1 0 ...
##  $ Separated            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Widowed              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Armed-Forces         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Craft-repair         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Exec-managerial      : num  0 1 0 0 0 1 0 1 0 1 ...
##  $ Farming-fishing      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Handlers-cleaners    : num  0 0 1 1 0 0 0 0 0 0 ...
##  $ Machine-op-inspct    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Other-service        : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Priv-house-serv      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Prof-specialty       : num  0 0 0 0 1 0 0 0 1 0 ...
##  $ Protective-serv      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sales                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Tech-support         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Transport-moving     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Not-in-family        : num  1 0 1 0 0 0 1 0 1 0 ...
##  $ Other-relative       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Own-child            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Unmarried            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Wife                 : num  0 0 0 0 1 1 0 0 0 0 ...
##  $ Asian-Pac-Islander   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Black                : num  0 0 0 1 1 0 1 0 0 0 ...
##  $ Other                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ White                : num  1 1 1 0 0 1 0 1 1 1 ...
##  $ Male                 : num  1 1 1 1 0 0 0 1 0 1 ...
##  $ other_countries      : num  0 0 0 0 1 0 1 0 0 0 ...
##  $ Philippines          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ United-States        : num  1 1 1 1 0 1 0 1 1 1 ...
##  $ income               : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
# Structure of the final test set; its column layout and income levels are
# expected to match those of newtrain2.
str(newtest2)
## 'data.frame':    16196 obs. of  44 variables:
##  $ age                  : int  25 38 28 44 18 34 29 63 24 55 ...
##  $ education.num        : int  7 9 12 10 10 6 9 15 10 4 ...
##  $ capital.gain         : int  0 0 0 7688 0 0 0 3103 0 0 ...
##  $ capital.loss         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hours.per.week       : int  40 50 40 40 30 30 40 32 40 10 ...
##  $ Local-gov            : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ No-gain              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Private              : num  1 1 0 1 1 1 1 0 1 1 ...
##  $ Self-emp-inc         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Self-emp-not-inc     : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ State-gov            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married-AF-spouse    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Married-civ-spouse   : num  0 1 1 1 0 0 0 1 0 1 ...
##  $ Married-spouse-absent: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Never-married        : num  1 0 0 0 1 1 1 0 1 0 ...
##  $ Separated            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Widowed              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Armed-Forces         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Craft-repair         : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Exec-managerial      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Farming-fishing      : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ Handlers-cleaners    : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ Machine-op-inspct    : num  1 0 0 1 0 0 0 0 0 0 ...
##  $ Other-service        : num  0 0 0 0 0 1 0 0 1 0 ...
##  $ Priv-house-serv      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Prof-specialty       : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ Protective-serv      : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ Sales                : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ Tech-support         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Transport-moving     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Not-in-family        : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ Other-relative       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Own-child            : num  1 0 0 0 1 0 0 0 0 0 ...
##  $ Unmarried            : num  0 0 0 0 0 0 1 0 1 0 ...
##  $ Wife                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Asian-Pac-Islander   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Black                : num  1 0 0 1 0 0 1 0 0 0 ...
##  $ Other                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ White                : num  0 1 1 0 1 1 0 1 1 1 ...
##  $ Male                 : num  1 1 1 1 0 1 1 1 0 1 ...
##  $ other_countries      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Philippines          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ United-States        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ income               : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 1 1 1 2 1 1 ...